In [23]:
import pandas as pd

In [24]:
X_tfidf = pd.read_csv('X_tfidf.csv')
split_data = pd.read_csv('split_data_with_sentiments.csv')

In [25]:
from sklearn.decomposition import PCA
pca = PCA(n_components=5)
X = pca.fit_transform(X_tfidf)
y = split_data['ttype']

In [26]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(X_train.shape)
print(X_test.shape)

(3993, 5)
(999, 5)


In [27]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score

logis = LogisticRegression().fit(X_train,y_train)
logis_pred = logis.predict(X_test)
print(classification_report(y_test, logis_pred))
print("roc-auc score:",roc_auc_score(y_test, logis_pred))

              precision    recall  f1-score   support

          -1       0.57      0.28      0.38       484
           1       0.54      0.80      0.65       515

    accuracy                           0.55       999
   macro avg       0.56      0.54      0.51       999
weighted avg       0.56      0.55      0.52       999

roc-auc score: 0.540495867768595


In [28]:
from sklearn.svm import LinearSVC
linearSVC= LinearSVC().fit(X_train, y_train)
linearSVC_pred = linearSVC.predict(X_test)
print(classification_report(y_test, linearSVC_pred))
print(roc_auc_score(y_test, linearSVC_pred))

              precision    recall  f1-score   support

          -1       0.58      0.27      0.37       484
           1       0.54      0.81      0.65       515

    accuracy                           0.55       999
   macro avg       0.56      0.54      0.51       999
weighted avg       0.56      0.55      0.52       999

0.5432219369333227


In [29]:
from sklearn.svm import SVC
svc = SVC().fit(X_train, y_train)
svc_pred = svc.predict(X_test)
print(classification_report(y_test, svc_pred))
print(roc_auc_score(y_test, svc_pred))

              precision    recall  f1-score   support

          -1       0.57      0.34      0.43       484
           1       0.55      0.76      0.64       515

    accuracy                           0.56       999
   macro avg       0.56      0.55      0.53       999
weighted avg       0.56      0.56      0.54       999

0.5500040118751504


In [30]:
from sklearn.tree import DecisionTreeClassifier
clf = DecisionTreeClassifier(criterion='entropy', max_depth=5)
clf.fit(X_train, y_train)
clf_pred = clf.predict(X_test)
print(classification_report(y_test,clf_pred))
print(roc_auc_score(y_test, clf_pred))

              precision    recall  f1-score   support

          -1       0.53      0.67      0.59       484
           1       0.59      0.44      0.51       515

    accuracy                           0.55       999
   macro avg       0.56      0.56      0.55       999
weighted avg       0.56      0.55      0.55       999

0.5570408408890315


In [31]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(max_depth=5, random_state=42).fit(X_train, y_train)
clf_pred = clf.predict(X_test)
print(classification_report(y_test, clf_pred))
print(roc_auc_score(y_test, clf_pred))

              precision    recall  f1-score   support

          -1       0.54      0.48      0.51       484
           1       0.56      0.61      0.58       515

    accuracy                           0.55       999
   macro avg       0.55      0.55      0.54       999
weighted avg       0.55      0.55      0.55       999

0.5455568482708818


In [32]:
from sklearn.neighbors import KNeighborsClassifier
knc = KNeighborsClassifier(n_neighbors=3).fit(X_train, y_train)
knc_pred = knc.predict(X_test)
print(classification_report(y_test, knc_pred))
print(roc_auc_score(y_test, knc_pred))

              precision    recall  f1-score   support

          -1       0.51      0.53      0.52       484
           1       0.55      0.53      0.54       515

    accuracy                           0.53       999
   macro avg       0.53      0.53      0.53       999
weighted avg       0.53      0.53      0.53       999

0.5296357217363395


In [33]:
from sklearn.naive_bayes import GaussianNB
gauss = GaussianNB().fit(X_train, y_train)
gauss_pred = gauss.predict(X_test)
print(classification_report(y_test, gauss_pred))
print(roc_auc_score(y_test, gauss_pred))

              precision    recall  f1-score   support

          -1       0.54      0.15      0.23       484
           1       0.52      0.88      0.66       515

    accuracy                           0.53       999
   macro avg       0.53      0.51      0.44       999
weighted avg       0.53      0.53      0.45       999

0.5141238064671427


In [34]:
from sklearn.model_selection import KFold

# define number of splits for cross-validation
k = 5

# initialize k-fold cross-validation
kf = KFold(n_splits=k, shuffle=True)

# initialize a list to store the model's accuracy for each fold
accuracies = []

# loop over each fold
for train_index, test_index in kf.split(X):
    # split the data into training and testing sets for this fold
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # initialize and train the model on the training set
    model = RandomForestClassifier(max_depth=5, random_state=42)
    model.fit(X_train, y_train)

    # evaluate the model on the testing set and append the accuracy to the list
    accuracy = model.score(X_test, y_test)
    accuracies.append(accuracy)

# compute the mean accuracy across all folds
mean_accuracy = sum(accuracies) / len(accuracies)

# print the mean accuracy
print("Mean accuracy: ", mean_accuracy)

Mean accuracy:  0.5582967737276354


In [35]:
from sklearn.model_selection import GridSearchCV
# Define the hyperparameters to search over
param = {'criterion': ['gini', 'entropy'], 'max_depth': [3, 4, 5, 6, 7], 'min_samples_split': [2, 3, 4, 5]}

# Define the Decision Tree Classifier
model2 = DecisionTreeClassifier(random_state=42)

# Define the GridSearchCV object
g_model2 = GridSearchCV(model2, param_grid=param,scoring='accuracy')

g_model2.fit(X_train, y_train)
g_model2_pred = g_model2.predict(X_test)

print(classification_report(y_test,g_model2_pred))
print("Best parameters:", g_model2.best_params_)
print("Best score:", g_model2.best_score_)

              precision    recall  f1-score   support

          -1       0.56      0.44      0.49       490
           1       0.55      0.67      0.61       508

    accuracy                           0.56       998
   macro avg       0.56      0.56      0.55       998
weighted avg       0.56      0.56      0.55       998

Best parameters: {'criterion': 'entropy', 'max_depth': 5, 'min_samples_split': 2}
Best score: 0.5548329522178412
