# In [None]:
# _**Import Packages**_

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

# _**Load Data**_

# Location of the dataset CSV.
csv_path = '/content/final_en.csv'

# Parse the CSV into a DataFrame; later cells read its 'title', 'text' and
# 'lebel' columns.
data = pd.read_csv(csv_path)

# Bare expression: when run as a notebook cell this renders a table preview.
data

# _**Word Cloud**_

from wordcloud import WordCloud, STOPWORDS

### _**word clouds of data[title] and data[text]**_

# The stop-word set is the same for both clouds, so build it once.
stopwords = set(STOPWORDS)

# One cloud per text column, title first and then text; this loop replaces two
# copy-pasted cells that differed only in the column they read.
for column in ('title', 'text'):
    # Drop missing cells and coerce to str before joining: str.join raises
    # TypeError on the float NaN pandas uses for empty CSV fields.
    text = " ".join(data[column].dropna().astype(str))
    wordcloud = WordCloud(stopwords=stopwords, background_color="white").generate(text)
    plt.figure(figsize=(15, 10))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis("off")
    plt.show()

# _**TF-IDF**_
# $$w_{x,y} = tf_{x,y} \times \log\left(\frac{N}{df_x}\right)$$

from sklearn.feature_extraction.text import TfidfVectorizer

# Keep the fitted vectorizers in named variables so the learned vocabulary and
# IDF weights remain available (e.g. to transform unseen documents later).
title_vectorizer = TfidfVectorizer()

# fillna(''): a missing title would otherwise reach the vectorizer as NaN and
# make the fit fail; an empty string simply yields an all-zero row.
title_tfidf = title_vectorizer.fit_transform(data['title'].fillna('').str.lower())

# Sparse-matrix preview. The original fitted a second, throw-away vectorizer on
# the un-lowered titles just for this display; TfidfVectorizer lowercases by
# default, so that matrix was identical and the single fit above is reused.
title_tfidf

# Dense copy of the title TF-IDF matrix, one row per document.
title = title_tfidf.toarray()

title.shape

# Same treatment for the article body.
text_vectorizer = TfidfVectorizer()
text_2 = text_vectorizer.fit_transform(data['text'].fillna('').str.lower()).toarray()

text_2.shape

# _**Dimensionality Reduction**_

from sklearn.decomposition import PCA
from sklearn.manifold import Isomap, MDS, TSNE

# Number of principal components kept for each text field.
N_COMPONENTS = 1500
# Fixed seed: PCA may pick a randomized SVD solver depending on the input
# shape, in which case the projection differs between runs unless seeded.
PCA_SEED = 42

# NOTE(review): both PCA models are fitted on every row, i.e. before the
# train/test split performed later in this file, so held-out rows influence
# the projection. Consider fitting on the training rows only.

# Title features -> N_COMPONENTS dimensions. A dedicated model object is used
# so it is not overwritten when the text model is fitted below.
title_pca = PCA(n_components=N_COMPONENTS, random_state=PCA_SEED)
title_pca.fit(title)
title_1500 = title_pca.transform(title)
np.save('title_pca_1500.npy', title_1500)

# Read the projection back from disk (the .npy file doubles as a cache).
title_reduction = np.load('title_pca_1500.npy')

# Text features -> N_COMPONENTS dimensions.
text_pca = PCA(n_components=N_COMPONENTS, random_state=PCA_SEED)
text_pca.fit(text_2)
text_1500 = text_pca.transform(text_2)
np.save('text_pca_1500.npy', text_1500)

text_reduction = np.load('text_pca_1500.npy')

# Backward compatibility: `pca` previously ended up bound to the text model.
pca = text_pca

# _**Create DataFrame**_

# Put the reduced title features and the reduced text features side by side.
# Despite its name, `df` is a plain NumPy array, not a pandas DataFrame.
feature_blocks = (title_reduction, text_reduction)
df = np.hstack(feature_blocks)

# Shape of the combined feature matrix.
df.shape

# _**Train-Test-Split Data**_

from sklearn.model_selection import train_test_split

# Feature matrix and target vector.
X = df
# NOTE(review): 'lebel' looks like a misspelling of 'label'; it is kept as-is
# because it has to match the CSV column header -- confirm against the file.
y = data['lebel']

# Hold out 35% of the rows for evaluation.
# random_state pins the shuffle so every downstream metric is reproducible;
# stratify=y keeps the class proportions equal in the train and test parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.35, random_state=42, stratify=y
)

print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

# _**Classification**_

### _**1. Logistic Regression**_

from sklearn.linear_model import LogisticRegression

# Train an L-BFGS logistic regression on the training split. fit() returns the
# estimator itself, so the chained call leaves `lgr` bound to the fitted model.
lgr = LogisticRegression(solver='lbfgs').fit(X_train, y_train)

# Predicted labels for the held-out rows.
y_pred_lgr = lgr.predict(X_test)

### _**2. SVC**_

from sklearn.svm import SVC

# Support-vector classifier with an RBF kernel.
svc = SVC(kernel='rbf')

# Fit on the training split and, in the same chain, label the held-out rows;
# `svc` itself is the fitted estimator afterwards.
y_pred_svc = svc.fit(X_train, y_train).predict(X_test)

### _**3. Decision Tree**_

from sklearn import tree
from sklearn.tree import DecisionTreeClassifier as DTC

# Shallow entropy-based tree (at most 4 levels).
# random_state is fixed because the tree permutes candidate features at each
# split, so ties between equally good splits can otherwise resolve differently
# from run to run.
dtc = DTC(criterion='entropy', max_depth=4, random_state=42)

dtc.fit(X_train, y_train)

# Predicted labels for the held-out rows.
y_pred_dtc = dtc.predict(X_test)

### _**4. AdaBoost**_

from sklearn.ensemble import AdaBoostClassifier as ABC

# Boost ten depth-4 entropy trees.
# random_state makes the ensemble reproducible; AdaBoost uses it to seed each
# base tree it builds.
abc = ABC(
    n_estimators=10,
    estimator=DTC(criterion='entropy', max_depth=4),
    random_state=42,
)

abc.fit(X_train, y_train)

# Predicted labels for the held-out rows.
y_pred_abc = abc.predict(X_test)

### _**5. Random Forest**_

from sklearn.ensemble import RandomForestClassifier as RFC

# Forest of 150 depth-4 entropy trees.
# random_state fixes the bootstrap samples and per-split feature subsets so
# the reported scores can be reproduced.
rfc = RFC(n_estimators=150, criterion='entropy', max_depth=4, random_state=42)

rfc.fit(X_train, y_train)

# Predicted labels for the held-out rows.
y_pred_rfc = rfc.predict(X_test)

### _**6. Naive Bayes**_

from sklearn.naive_bayes import GaussianNB

# Gaussian naive Bayes with default settings, trained on the training split.
# fit() returns the estimator, so `gnb` is the fitted model.
gnb = GaussianNB().fit(X_train, y_train)

# Predicted labels for the held-out rows.
y_pred_gnb = gnb.predict(X_test)

### _**7. KNN**_

from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours vote over the 6 closest training rows (Minkowski metric).
knn = KNeighborsClassifier(metric='minkowski', n_neighbors=6)

# Fit on the training split, then label the held-out rows in one chain;
# `knn` itself is the fitted estimator afterwards.
y_pred_knn = knn.fit(X_train, y_train).predict(X_test)

# _**Metrics**_

from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, confusion_matrix

# Test-set predictions keyed by display name, in the order the models were
# trained. One loop replaces seven copy-pasted groups of print calls.
predictions_by_model = {
    'Logistic Regression': y_pred_lgr,
    'SVC': y_pred_svc,
    'Decision Tree': y_pred_dtc,
    'Ada Boost': y_pred_abc,
    'Random Forest': y_pred_rfc,
    'Naive Bayes': y_pred_gnb,
    'KNN': y_pred_knn,
}

# f1/precision/recall are called with their default averaging
# (average='binary'), so this report assumes a two-class target.
for model_name, model_pred in predictions_by_model.items():
    # Label every figure; the bare numbers were impossible to attribute.
    print(f'--- {model_name} ---')
    print('accuracy :', accuracy_score(y_true=y_test, y_pred=model_pred))
    print('f1       :', f1_score(y_true=y_test, y_pred=model_pred))
    print('precision:', precision_score(y_true=y_test, y_pred=model_pred))
    print('recall   :', recall_score(y_true=y_test, y_pred=model_pred))
    print(confusion_matrix(y_true=y_test, y_pred=model_pred))

# _**Visualization**_

plt.figure(figsize=(10, 5))

# Classifier display names and their test-set predictions, in matching order.
classifier_names = ['Logistic Regression', 'SVC', 'Decision Tree', 'Ada Boost',
                    'Random Forest', 'Naive Bayes', 'KNN']
classifier_preds = [y_pred_lgr, y_pred_svc, y_pred_dtc, y_pred_abc,
                    y_pred_rfc, y_pred_gnb, y_pred_knn]

# One line per metric: (legend label, scoring function, colour, marker).
metric_styles = [
    ("accuracy_score", accuracy_score, "r", "*"),
    ("f1_score", f1_score, "g", "o"),
    ("precision_score", precision_score, "y", "s"),
    ("recall_score", recall_score, "b", "v"),
]

# The scores live in a loop-local list rather than in variables named
# x / y / yy / ...: the earlier version rebound `y`, silently replacing the
# label vector with a list of accuracies for any cell re-run afterwards.
for legend_label, scorer, colour, marker in metric_styles:
    scores = [scorer(y_true=y_test, y_pred=pred) for pred in classifier_preds]
    plt.plot(classifier_names, scores, label=legend_label, color=colour, marker=marker)

plt.xlabel('Classifier')
plt.ylabel('Metric Value')
plt.title('Comparison of Metrics across Classifier')
plt.legend()
plt.show()

## _**Confusion Matrix**_

from sklearn.metrics import ConfusionMatrixDisplay

# Draw one confusion-matrix figure per model.
for plot_title, model, predicted in (
    ('Logistic Regression', lgr, y_pred_lgr),
    ('SVC', svc, y_pred_svc),
):
    # Create the axes explicitly and hand them to disp.plot(): called without
    # `ax`, plot() opens a figure of its own, which left the earlier
    # plt.figure(figsize=(8, 8)) empty and its size unused.
    fig, ax = plt.subplots(figsize=(8, 8))

    # labels=model.classes_ forces the matrix rows/columns into the same order
    # as the tick labels, even if a class is absent from this test split.
    cm = confusion_matrix(y_true=y_test, y_pred=predicted, labels=model.classes_)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=model.classes_)

    disp.plot(ax=ax)
    ax.set_title(plot_title)
    plt.show()

