In [1]:
### Basic Imports
import os
import pandas as pd
import numpy as np
from itertools import combinations

### Graphic libraries
import matplotlib.pyplot as plt
import seaborn as sns 

### Importing text processing packages
from nltk.corpus import PlaintextCorpusReader
from nltk import word_tokenize
from valuable_features import *

### Importing the relevant ML libraries 
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
import xgboost as xgb
from sklearn.metrics import mean_squared_error, roc_auc_score
from sklearn.model_selection import StratifiedKFold, cross_validate, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score,roc_curve,auc,recall_score,f1_score,precision_score,classification_report,confusion_matrix,auc
from sklearn.feature_selection import SelectFromModel
from sklearn.utils import resample

# Creating features for the comments (if necessary)

## Read in training and validation data first

In [None]:
# Project root is two directory levels above the notebook's working directory.
notebook_dir = os.getcwd()
path_parent = os.path.dirname(os.path.dirname(notebook_dir))

# Raw labelled comments (training) and the held-out validation comments.
validation_data_path = f'{path_parent}/Data/Thoughtful Comments/validation_comments.csv'
training_data_path = f'{path_parent}/Data/Thoughtful Comments/thoughtful_comments_labelled.csv'

## Create features
### Function for creating features in valuable_features.py

In [None]:
# Build the feature table for the labelled training comments and persist it.
# NOTE(review): to_csv also writes the frame's index as the first column; confirm
# that is wanted, because the later read_csv call does not pass index_col.
training_features_created = create_features(training_data_path)
training_features_created.to_csv(
    path_or_buf=f'{path_parent}/Data/Thoughtful Comments/thoughtful_comments_final.csv'
)

# Same treatment for the validation comments.
validation_features_created = create_features(validation_data_path)
validation_features_created.to_csv(
    path_or_buf=f'{path_parent}/Data/Thoughtful Comments/validation_comments_final.csv'
)

# Reading training and validation data after features creation for EDA and modelling

In [None]:
# Recompute the project root here so this cell works without the feature-creation cells.
path_parent = os.path.dirname(os.path.dirname(os.getcwd()))

# Load the feature tables written by the feature-creation step.
training_df = pd.read_csv(
    filepath_or_buffer=f'{path_parent}/Data/Thoughtful Comments/thoughtful_comments_final.csv'
)
validation_df = pd.read_csv(
    filepath_or_buffer=f'{path_parent}/Data/Thoughtful Comments/validation_comments_final.csv'
)

# EDA

In [None]:
# Summary statistics of the training feature table (displayed as the cell output).
training_df.describe()

In [None]:
# A notebook cell only echoes its last expression, so the first frequency table
# has to be displayed explicitly or it is silently discarded.
display(training_df[['Length Category']].value_counts())
training_df[['Relevance Score Category']].value_counts()

## Plotting boxplots for all features to see whether there is a significant difference between valuable and not-valuable comments

### Feature 1 (Length)

In [None]:
# Comment length grouped by the 'Thoughtful?' label, outlier points included.
sns.set_theme(style='whitegrid')
fig, ax = plt.subplots(figsize=(10, 8))
sns.boxplot(x='Thoughtful?', y='Length', data=training_df, ax=ax)
ax.set_title('Boxplot of Length (with outliers)', fontsize=20)

In [None]:
# Comment length grouped by the 'Thoughtful?' label, outlier points hidden.
sns.set_theme(style='whitegrid')
fig, ax = plt.subplots(figsize=(10, 8))
sns.boxplot(x='Thoughtful?', y='Length', data=training_df, showfliers=False, ax=ax)
ax.set_title('Boxplot of Length (without outliers)', fontsize=20)

### Feature 2 (Comment Likelihood)

In [None]:
# Average log-likelihood grouped by the 'Thoughtful?' label, outlier points included.
sns.set_theme(style='whitegrid')
fig, ax = plt.subplots(figsize=(10, 8))
sns.boxplot(x='Thoughtful?', y='Average Loglikelihood', data=training_df, ax=ax)
ax.set_title('Boxplot of Comment Likelihood (with outliers)', fontsize=20)

In [None]:
# Average log-likelihood grouped by the 'Thoughtful?' label, outlier points hidden.
sns.set_theme(style='whitegrid')
fig, ax = plt.subplots(figsize=(10, 8))
sns.boxplot(x='Thoughtful?', y='Average Loglikelihood', data=training_df, showfliers=False, ax=ax)
ax.set_title('Boxplot of Comment Likelihood (without outliers)', fontsize=20)

### Feature 3 (Num Verbs)

In [None]:
# Verb count grouped by the 'Thoughtful?' label, outlier points included.
sns.set_theme(style='whitegrid')
fig, ax = plt.subplots(figsize=(10, 8))
sns.boxplot(x='Thoughtful?', y='Num Verbs', data=training_df, ax=ax)
ax.set_title('Boxplot of Number of Verbs (with outliers)', fontsize=20)

In [None]:
# Verb count grouped by the 'Thoughtful?' label, outlier points hidden.
sns.set_theme(style='whitegrid')
fig, ax = plt.subplots(figsize=(10, 8))
sns.boxplot(x='Thoughtful?', y='Num Verbs', data=training_df, showfliers=False, ax=ax)
ax.set_title('Boxplot of Number of Verbs (without outliers)', fontsize=20)

### Feature 4 (Num Discourse Relations)

In [None]:
# Discourse-relation count grouped by the 'Thoughtful?' label, outlier points included.
sns.set_theme(style='whitegrid')
fig, ax = plt.subplots(figsize=(10, 8))
sns.boxplot(x='Thoughtful?', y='Num Discourse Relations', data=training_df, ax=ax)
ax.set_title('Boxplot of Number of Discourse Relations (with outliers)', fontsize=20)

In [None]:
# Discourse-relation count grouped by the 'Thoughtful?' label, outlier points hidden.
sns.set_theme(style='whitegrid')
fig, ax = plt.subplots(figsize=(10, 8))
sns.boxplot(x='Thoughtful?', y='Num Discourse Relations', data=training_df, showfliers=False, ax=ax)
ax.set_title('Boxplot of Number of Discourse Relations (without outliers)', fontsize=20)

### Feature 5 (Relevance Score)

In [None]:
# Relevance score grouped by the 'Thoughtful?' label, outlier points included.
sns.set_theme(style='whitegrid')
fig, ax = plt.subplots(figsize=(10, 8))
sns.boxplot(x='Thoughtful?', y='Relevance score', data=training_df, ax=ax)
ax.set_title('Boxplot of Relevance Score (with outliers)', fontsize=20)

In [None]:
# Relevance score grouped by the 'Thoughtful?' label, outlier points hidden.
sns.set_theme(style='whitegrid')
fig, ax = plt.subplots(figsize=(10, 8))
sns.boxplot(x='Thoughtful?', y='Relevance score', data=training_df, showfliers=False, ax=ax)
ax.set_title('Boxplot of Relevance Score (without outliers)', fontsize=20)

### Feature 6 (Num Pronouns)

In [None]:
# Pronoun count grouped by the 'Thoughtful?' label, outlier points included.
sns.set_theme(style='whitegrid')
fig, ax = plt.subplots(figsize=(10, 8))
sns.boxplot(x='Thoughtful?', y='Num Pronouns', data=training_df, ax=ax)
ax.set_title('Boxplot of Num Pronouns (with outliers)', fontsize=20)

In [None]:
# Pronoun count grouped by the 'Thoughtful?' label, outlier points hidden.
sns.set_theme(style='whitegrid')
fig, ax = plt.subplots(figsize=(10, 8))
sns.boxplot(x='Thoughtful?', y='Num Pronouns', data=training_df, showfliers=False, ax=ax)
ax.set_title('Boxplot of Num Pronouns (without outliers)', fontsize=20)

## Looking at distributions of independent variables

In [None]:
# Name of the label column.
target = 'Thoughtful?'

# Columns used as model inputs throughout the notebook.
features_list = [
    'Length Category',
    'Average Loglikelihood',
    'Num Verbs',
    'Num Discourse Relations',
    'Relevance score',
    'Num Pronouns',
]

# Function to plot distribution graphs
def eda_plots(df, features_list, plot_title, fig_size):
    """Draw 40-bin histograms of the selected column(s) of ``df`` under one title.

    Parameters
    ----------
    df : frame holding the data to plot.
    features_list : column label(s) used to index ``df``.
    plot_title : text rendered as the figure-level title.
    fig_size : (width, height) forwarded to ``hist`` as ``figsize``.

    Returns
    -------
    None. The figure is rendered through ``plt.show()``.
    """
    hist_options = dict(
        bins=40,
        edgecolor='b',
        linewidth=1.0,
        xlabelsize=8,
        ylabelsize=8,
        grid=False,
        figsize=fig_size,
        color='red',
    )
    selected = df[features_list]
    selected.hist(**hist_options)

    # The layout rectangle extends to 1.2 on both axes; the title sits at y=1.25, above it.
    plt.tight_layout(rect=(0, 0, 1.2, 1.2))
    plt.suptitle(plot_title, x=0.65, y=1.25, fontsize=14)

    plt.show()

In [None]:
# `df` is never defined in this notebook; the frame loaded for EDA is `training_df`.
eda_plots(training_df, features_list, 'Features Univariate Plots', (20, 7))

In [None]:
# `df` is never defined in this notebook; use `training_df`. The target name is
# wrapped in a list so eda_plots receives a column list, as in the features call.
eda_plots(training_df, [target], 'Target Variable Univariate Plots', (5, 5))

# Standardizing independent variables

In [None]:
# Standardise the model inputs. The scaler is fitted on the training data only;
# the validation data is transformed with those same training statistics.
# Re-fitting on the validation set would put the two sets on different scales
# and leak validation statistics into the evaluation.
scaler = StandardScaler()

training_X_scaled = pd.DataFrame(
    scaler.fit_transform(training_df[features_list]),
    columns=features_list,
    index=training_df.index,  # keep rows aligned with training_df's labels
)

validation_X_scaled = pd.DataFrame(
    scaler.transform(validation_df[features_list]),  # transform only, no re-fit
    columns=features_list,
    index=validation_df.index,
)

# Feature importance (using RandomForestClassifier)

In [None]:
# Inputs and labels for the feature-importance study.
feat_labels = ['Length Category', 'Average Loglikelihood', 'Num Verbs', 'Num Discourse Relations', 'Relevance score', 'Num Pronouns']
X = training_X_scaled[feat_labels]
y = training_df['Thoughtful?']

# 70/30 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=10,
)

In [None]:
# Wrap a 500-tree random forest in SelectFromModel and fit it on the training split.
importance_forest = RandomForestClassifier(n_estimators=500, random_state=10)
sel_rfc = SelectFromModel(estimator=importance_forest)
sel_rfc.fit(X_train, y_train)

In [None]:
# Boolean mask of the columns kept by the selector, applied to the column index.
support_mask = sel_rfc.get_support()
selected_feat = X_train.columns[support_mask]
selected_feat

In [None]:
# Fit a standalone forest and list the importance it assigns to every feature.
clf = RandomForestClassifier(random_state=10, n_estimators=500)
clf.fit(X_train, y_train)

# One (label, importance) tuple per output line.
for label_and_importance in zip(feat_labels, clf.feature_importances_):
    print(label_and_importance)

# Modelling

## Creating a dictionary for all the classification models

In [None]:
# Display name -> unfitted estimator, in the order the results are reported.
# A plain DecisionTreeClassifier is currently not part of the comparison.
models_dict = dict([
    ('Logistic Regression', LogisticRegression()),
    ('Naive Bayes', GaussianNB()),
    ('SVM linear', svm.SVC(kernel='linear', probability=True)),
    ('SVM rbf', svm.SVC(kernel='rbf', probability=True)),
    ('Random Forest', RandomForestClassifier(n_estimators=500, random_state=10)),
    ('XGBoost', xgb.XGBClassifier(use_label_encoder=False)),
])

## Train test split on training data

In [None]:
# Creating our independent and dependent variables df
X = training_X_scaled[['Length Category', 'Average Loglikelihood', 'Num Verbs', 'Num Discourse Relations', 'Relevance score', 'Num Pronouns']]
y = training_df['Thoughtful?']

# Perform train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=15) 

# Training with differnent models
for model_name in models_dict:
    m = models_dict[model_name]
    
    m.fit(X_train, y_train)
    predictions = m.predict(X_test)

    acc = accuracy_score(y_test,predictions)  # always true label first, then your predicted labels
    precision = precision_score(y_test,predictions) 
    recall = recall_score(y_test,predictions) 
    f1 = f1_score(y_test,predictions)

    print(model_name)
    print('-'*50)
    print('Accuracy Score for {} is {:.5f}'.format(model_name,acc))
    print('Precision Score for {} is {:.5f}'.format(model_name,precision))
    print('Recall Score for {} is {:.5f}'.format(model_name,recall))
    print('F1 Score for {} is {:.5f}'.format(model_name,f1))
    print()

## Stratified K-fold Cross Validation

In [None]:
# Label column the folds are stratified on, and the 10-fold splitter itself.
target = training_df['Thoughtful?']
skf = StratifiedKFold(n_splits=10)

In [None]:
# Manual stratified k-fold loop: prints the class ratio and F1 score per fold.
# Uses `skf` and `target` from the setup cell above.
X = ['Length Category', 'Average Loglikelihood', 'Num Verbs', 'Num Discourse Relations', 'Relevance score', 'Num Pronouns']
y = ['Thoughtful?']

model = GaussianNB() # can change the model here

# The original iterated over an undefined `df`; the folds are cut from training_df.
for fold_no, (train_index, test_index) in enumerate(skf.split(training_df, target), start=1):
    # split() yields positional indices, so select rows with iloc rather than loc.
    train = training_df.iloc[train_index]
    test = training_df.iloc[test_index]
    print('Fold', str(fold_no), 'Class Ratio:', sum(test['Thoughtful?'])/len(test['Thoughtful?']))

    X_train = train[X]
    y_train = train[y]
    X_test = test[X]
    y_test = test[y]

    # fit() expects a 1-D label vector, hence the ravel of the one-column frame.
    model.fit(X_train, y_train.values.ravel())
    predictions = model.predict(X_test)
    print('Fold', str(fold_no), 'F1 score:', f1_score(y_test,predictions))
    print()

In [None]:
# Scaled inputs and the one-column label frame for cross-validation.
X = training_X_scaled[['Length Category', 'Average Loglikelihood', 'Num Verbs', 'Num Discourse Relations', 'Relevance score', 'Num Pronouns']]
y = training_df[['Thoughtful?']]

# 10-fold cross-validation of every candidate; report the mean of each metric.
for model_name, m in models_dict.items():
    scoring = ['accuracy', 'precision', 'recall', 'f1']
    scores = cross_validate(m, X.values, y.values.ravel(), cv=10, scoring=scoring)

    print(model_name)
    print('-'*50)
    for metric in scoring:
        # cross_validate stores each metric's fold scores under 'test_<metric>'.
        print(f"Mean {metric} is {scores['test_' + metric].mean()}")
    print()
    print()

## Test against validation data with the best model after stratified k-fold cross validation

In [None]:
# Creating our independent and dependent variables df
X_train = training_X_scaled[['Length Category', 'Average Loglikelihood', 'Num Verbs', 'Num Discourse Relations', 'Relevance score', 'Num Pronouns']]
y_train = training_df[['Thoughtful?']]
X_test = validation_X_scaled[['Length Category', 'Average Loglikelihood', 'Num Verbs', 'Num Discourse Relations', 'Relevance score', 'Num Pronouns']]
y_test = validation_df[['Thoughtful?']]

# Training with differnent models
for model_name in models_dict:
    m = models_dict[model_name]

    # X_train = training_data[X]
    # y_train = training_data[y]
    # X_test = validation_data[X]
    # y_test = validation_data[y]
    
    m.fit(X_train, y_train.values.ravel())
    predictions = m.predict(X_test)

    acc = accuracy_score(y_test,predictions)  # always true label first, then your predicted labels!
    precision = precision_score(y_test,predictions) 
    recall = recall_score(y_test,predictions) 
    f1 = f1_score(y_test,predictions)

    print(model_name)
    print('-'*50)
    print('Accuracy Score for {} is {:.5f}'.format(model_name,acc))
    print('Precision Score for {} is {:.5f}'.format(model_name,precision))
    print('Recall Score for {} is {:.5f}'.format(model_name,recall))
    print('F1 Score for {} is {:.5f}'.format(model_name,f1))
    print()

### ROC score

In [None]:
# ROC AUC of the chosen model on the validation data.
model = GaussianNB() #change this based on the best model performance

# predict_proba needs a fitted estimator; the original called it on a fresh,
# unfitted model. Fit on the X_train / y_train defined for the validation test above.
model.fit(X_train, np.ravel(y_train))

# Column 1 of predict_proba corresponds to model.classes_[1].
# NOTE(review): assumes the labels are 0/1 so that this is the thoughtful class - confirm.
prob = model.predict_proba(X_test)[:, 1]

# The original referenced an undefined `prob_no_unsampled`; score the
# probabilities computed just above instead.
print(roc_auc_score(np.ravel(y_test), prob))

In [None]:
# ROC curve for the probabilities in `prob`, computed in the ROC score cell.
lr_fpr, lr_tpr, _ = roc_curve(np.ravel(y_test), prob)

# Take the legend label from the estimator that was scored, so the plot cannot
# be mislabelled when `model` is swapped (the label was hard-coded to an SVM).
plt.plot(lr_fpr, lr_tpr, marker='.', label=type(model).__name__)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend()
plt.show()

## Apply upsampling and run the models again

In [None]:
# Split the scaled training data by class in preparation for upsampling.
features_list = ['Length Category', 'Average Loglikelihood', 'Num Verbs', 'Num Discourse Relations', 'Relevance score', 'Num Pronouns']

# Unscaled per-class views, kept for reference.
training_thoughtful = training_df[training_df['Thoughtful?'] == 1]
training_unthoughtful = training_df[training_df['Thoughtful?'] == 0]

# Standardise ONCE with statistics from the whole training set. Fitting a
# separate scaler per class (as before) centres both classes at zero and
# removes the between-class differences the models are meant to learn.
scaler = StandardScaler()
training_scaled = pd.DataFrame(
    scaler.fit_transform(training_df[features_list]),
    columns=features_list,
    index=training_df.index,
)

# Attach the labels while both frames share training_df's index. The original
# assigned labels onto freshly re-indexed frames, which aligns by index and
# yields NaN or mismatched labels.
training_scaled['Thoughtful?'] = training_df['Thoughtful?']

training_thoughtful_X_scaled = training_scaled[training_scaled['Thoughtful?'] == 1]
training_unthoughtful_X_scaled = training_scaled[training_scaled['Thoughtful?'] == 0]

# Class sizes before upsampling (the original lines had stray '[' syntax errors).
print(len(training_thoughtful_X_scaled))
print(len(training_unthoughtful_X_scaled))

In [None]:
# Draw 1000 thoughtful rows with replacement (fixed seed), then stack them
# underneath the unthoughtful rows to form the upsampled training set.
training_thoughtful_upsampled = resample(
    training_thoughtful_X_scaled,
    n_samples=1000,
    replace=True,
    random_state=170,
)
frames_to_stack = [training_unthoughtful_X_scaled, training_thoughtful_upsampled]
training_upsampled = pd.concat(frames_to_stack)

### Testing on validation data after upsampling

In [None]:
# Creating our independent and dependent variables df
X_train = training_upsampled[['Length Category', 'Average Loglikelihood', 'Num Verbs', 'Num Discourse Relations', 'Relevance score', 'Num Pronouns']]
y_train = training_upsampled[['Thoughtful?']]
X_test = validation_X_scaled[['Length Category', 'Average Loglikelihood', 'Num Verbs', 'Num Discourse Relations', 'Relevance score', 'Num Pronouns']]
y_test = validation_df[['Thoughtful?']]

# Training with differnent models
for model_name in models_dict:
    m = models_dict[model_name]

    # X_train = training_data[X]
    # y_train = training_data[y]
    # X_test = validation_data[X]
    # y_test = validation_data[y]
    
    m.fit(X_train, y_train.values.ravel())
    predictions = m.predict(X_test)

    acc = accuracy_score(y_test,predictions)  # always true label first, then your predicted labels!
    precision = precision_score(y_test,predictions) 
    recall = recall_score(y_test,predictions) 
    f1 = f1_score(y_test,predictions)

    print(model_name)
    print('-'*50)
    print('Accuracy Score for {} is {:.5f}'.format(model_name,acc))
    print('Precision Score for {} is {:.5f}'.format(model_name,precision))
    print('Recall Score for {} is {:.5f}'.format(model_name,recall))
    print('F1 Score for {} is {:.5f}'.format(model_name,f1))
    print()

In [None]:
def get_features_combi(features, n, min_size=3):
    """Return every combination of ``features`` with ``min_size`` to ``n`` items.

    Parameters
    ----------
    features : iterable of feature names.
    n : largest subset size to generate (inclusive).
    min_size : smallest subset size to generate (inclusive). Defaults to 3,
        the lower bound that used to be hard-coded.

    Returns
    -------
    list of tuples, ordered by subset size and, within a size, in the order
    produced by ``itertools.combinations``. Empty when ``n < min_size`` or when
    there are fewer features than ``min_size``.
    """
    # Materialise once so one-shot iterables can be reused for every size.
    pool = list(features)
    return [
        subset
        for size in range(min_size, n + 1)
        for subset in combinations(pool, size)
    ]

# Every feature subset of size 3 up to 6 drawn from the six model inputs.
candidate_features = ['Length Category', 'Average Loglikelihood', 'Num Verbs', 'Num Discourse Relations', 'Relevance score', 'Num Pronouns']
combi = get_features_combi(candidate_features, 6)

## Test all possible feature combinations to find the best model (on validation data)

In [None]:
# Exhaustive search over (feature subset, model) pairs; keeps the pair with the
# highest F1 on the validation data.
# NOTE(review): the winner is selected on the same validation set it is scored
# on, so the reported metrics are likely optimistic.
scores = {
    'features': None,
    'model': None,
    'acc': 0,
    'prec': 0,
    'rec': 0,
    'f1': 0
}

# The labels do not depend on the feature subset, so slice them once.
y = ['Thoughtful?']
y_train = training_upsampled[y]
y_test = validation_df[y]

# The original looped over an undefined `comb`; the combinations list is `combi`.
for features in combi:
    X = list(features)
    X_train = training_upsampled[X]
    X_test = validation_X_scaled[X]

    # Training with different models
    for model_name, m in models_dict.items():
        m.fit(X_train, y_train.values.ravel())
        predictions = m.predict(X_test)

        # Metric signature is (true labels, predicted labels).
        acc = accuracy_score(y_test, predictions)
        precision = precision_score(y_test, predictions)
        recall = recall_score(y_test, predictions)
        f1 = f1_score(y_test, predictions)

        # Strict '>' keeps the first pair seen when F1 scores tie.
        if f1 > scores['f1']:
            scores.update(
                features=features,
                model=model_name,
                acc=acc,
                prec=precision,
                rec=recall,
                f1=f1,
            )
scores

### ROC score after upsampling

In [None]:
# ROC AUC of the chosen model after upsampling, on the validation data.
model1 = GaussianNB() #change this based on the best model performance

# predict_proba needs a fitted estimator; the original called it on a fresh,
# unfitted model. Fit on the upsampled X_train / y_train currently in scope.
model1.fit(X_train, np.ravel(y_train))

# Column 1 of predict_proba corresponds to model1.classes_[1].
# NOTE(review): assumes the labels are 0/1 so that this is the thoughtful class - confirm.
prob_upsampled = model1.predict_proba(X_test)[:, 1]
print(roc_auc_score(np.ravel(y_test), prob_upsampled))

In [None]:
# ROC curve for the probabilities in `prob_upsampled` (post-upsampling model).
lr_fpr, lr_tpr, _ = roc_curve(np.ravel(y_test), prob_upsampled)

# Take the legend label from the estimator that was scored, so the plot cannot
# be mislabelled when `model1` is swapped (the label was hard-coded to an SVM).
plt.plot(lr_fpr, lr_tpr, marker='.', label=type(model1).__name__)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend()
plt.show()