In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, KFold
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score
from scipy import stats as st

nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [None]:

# Global constants. Note that you will change  rare doc frequency threshold and max_features will be programmatically as part of hyperparameter optimization. I've set a default value for testing purposes



ENGLISH_STOP_WORDS_MODIFIED =[
    'a',
    'about',
    'above',
    'across',
    'after',
    'afterwards',
    #'again',
    'against',
    'ain',
    'all',
    #'almost',
    #'alone',
    'along',
    #'already',
    #'also',
    #'although',
    'always',
    'am',
    'among',
    'amongst',
    'amoungst',
    'amount',
    'an',
    'and',
    #'another',
    #'any',
    'anyhow',
    #'anyone',
    #'anything',
    'anyway',
    #'anywhere',
    'are',
    'aren',
    'around',
    'as',
    'at',
    'back',
    'be',
    'became',
    'because',
    'become',
    'becomes',
    'becoming',
    'been',
    #'before',
    'beforehand',
    #'behind',
    'being',
    #'below',
    'beside',
    'besides',
    'between',
    #'beyond',
    'bill',
    'both',
    'bottom',
    'but',
    'by',
    #'call',
    'can',
    #'cannot',
    #'cant',
    'co',
    'con',
    #'could',
    #'couldn',
    #'couldnt',
    'cry',
    'd',
    'de',
    #'describe',
    #'detail',
    #'did',
    #'didn',
    'do',
    #'does',
    #'doesn',
    'doing',
    'don',
    #'done',
    'down',
    'due',
    'during',
    'each',
    'eg',
    'eight',
    'either',
    'eleven',
    'else',
    'elsewhere',
    'empty',
    #'enough',
    'etc',
    'even',
    'ever',
    'every',
    'everyone',
    'everything',
    'everywhere',
    #'except',
    #'few',
    'fifteen',
    'fify',
    'fill',
    #'find',
    'fire',
    #'first',
    'five',
    'for',
    #'former',
    #'formerly',
    'forty',
    'found',
    'four',
    'from',
    'front',
    'full',
    #'further',
    'get',
    'give',
    'go',
    'had',
    'hadn',
    #'has',
    #'hasn',
    #'hasnt',
    'have',
    'haven',
    'having',
    'he',
    'hence',
    'her',
    'here',
    'hereafter',
    'hereby',
    'herein',
    'hereupon',
    'hers',
    'herself',
    'him',
    'himself',
    'his',
    'how',
    #'however',
    'hundred',
    'i',
    'ie',
    'if',
    'in',
    'inc',
    'indeed',
    'interest',
    'into',
    'is',
    #'isn',
    'it',
    'its',
    'itself',
    #'just',
    #'keep',
    #'last',
    'latter',
    'latterly',
    #'least',
    #'less',
    'll',
    'ltd',
    'm',
    'ma',
    #'made',
    #'many',
    'may',
    'me',
    'meanwhile',
    'might',
    'mightn',
    'mill',
    'mine',
    #'more',
    #'moreover',
    #'most',
    #'mostly',
    'move',
    'much',
    'must',
    'mustn',
    'my',
    'myself',
    'name',
    #'namely',
    #'needn',
    #'neither',
    #'never',
    #'nevertheless',
    #'next',
    'nine',
    'no',
    #'nobody',
    'none',
    'noone',
    'nor',
    'not',
    #'nothing',
    'now',
    'nowhere',
    'o',
    'of',
    'off',
    'often',
    'on',
    'once',
    'one',
    'only',
    'onto',
    'or',
    'other',
    #'others',
    #'otherwise',
    'our',
    'ours',
    'ourselves',
    'out',
    'over',
    'own',
    'part',
    'per',
    'perhaps',
    'please',
    'put',
    #'rather',
    're',
    's',
    'same',
    #'see',
    #'seemed',
    #'seeming',
    #'seems',
    #'serious',
    #'several',
    'shan',
    'she',
    'should',
    'shouldn',
    'show',
    'side',
    'since',
    'sincere',
    'six',
    'sixty',
    'so',
    'some',
    'somehow',
    'someone',
    'something',
    'sometime',
    'sometimes',
    'somewhere',
    'still',
    'such',
    'system',
    't',
    'take',
    'ten',
    'than',
    'that',
    'the',
    'their',
    'theirs',
    'them',
    'themselves',
    'then',
    'thence',
    'there',
    'thereafter',
    'thereby',
    'therefore',
    'therein',
    'thereupon',
    'these',
    'they',
    'thick',
    'thin',
    'third',
    'this',
    'those',
    'though',
    'three',
    'through',
    'throughout',
    'thru',
    'thus',
    'to',
    'together',
    'too',
    #'top',
    'toward',
    'towards',
    'twelve',
    'twenty',
    'two',
    'un',
    'under',
    'until',
    'up',
    'u',  #added
    'wa', #added
    'upon',
    'us',
    've',
    #'very',
    'via',
    'was',
    'wasn',
    'we',
    'well',
    'were',
    'weren',
    'what',
    'whatever',
    'when',
    'whence',
    'whenever',
    'where',
    'whereafter',
    'whereas',
    'whereby',
    'wherein',
    'whereupon',
    'wherever',
    'whether',
    'which',
    'while',
    'whither',
    'who',
    'whoever',
    'whole',
    'whom',
    'whose',
    'why',
    'will',
    'with',
    'within',
    'without',
    'won',
    #'would',
    #'wouldn',
    'y',
    'yet',
    'you',
    'your',
    'yours',
    'yourself',
    'yourselves'
]

# Note; I couldn't figure out how to get the manual features added. I use sklearn vectorizer classes for extracting text features, and I don't know how to easily add manual features.
# If you know how to do this, and in a way that allows those features to avoid the cutoffs due to min/max document frequency//top N features then please add the below in. If we really need to,
# we can do a very roundabout way by making another vectorizer that is run only on text with these features in it, then applying that vectorizer via the .transform() method to our text data, and
# then concatenating the datasets.
MANUAL_FEATURES = [
    "cannot", "unconvinced", "needs", "unclear", "already", "how will", "how do", "unlikely", "why don't", "is not novel work", "isn't novel", "is novel work", "novel work", "is novel", "an accept", "a reject", "convincing", "interesting"] # TODO RETURN TO if time
class LemmaTokenizer:
    def __init__(self):
        self.wnl = WordNetLemmatizer()
    def __call__(self, doc):
        return ([self.wnl.lemmatize(t) for t in word_tokenize(doc)])
RARE_DOCUMENT_FREQUENCY_THRESHOLD = 1  # TODO produce multiple datasets for multiple values of this threshold
MAX_FEATURES = 2000;


In [None]:
def read_data(path):
  df = pd.read_csv(path)
  return df

In [None]:
def preprocess(df, column):
  df = df.dropna(subset=[column]).reset_index(drop=True)
  lemmatizer = WordNetLemmatizer()
  stop_words = set(stopwords.words('english'))

  cleaned = []
  for review in df[column]:
    temp = review.lower()
    temp = temp.replace('\n', ' ')
    temp = temp.replace(',', ' ')
    temp = temp.replace('.', ' ')
    temp = temp.replace('!', ' ')
    temp = temp.replace('?', ' ')
    temp = temp.replace(':', ' ')
    temp = temp.replace(';', ' ')
    temp = [lemmatizer.lemmatize(word) for word in temp.split() if word not in stop_words]
    temp = ' '.join(temp)
    cleaned.append(temp)
  df[column] = cleaned
  return df

In [None]:
data_as_given = read_data('data_reviews_filtered.csv')
data_as_given = preprocess(data_as_given, 'comments')

In [None]:
data_as_given['accepted'] = data_as_given['accepted'].astype(str).str.strip().str.capitalize()
data_as_given['accepted'] = data_as_given['accepted'].map({'True': 1, 'False': 0})
data_as_given.head()

Unnamed: 0.1,Unnamed: 0,accepted,title,comments,scoreconf_score,score,conf_score
0,0,0,Boosted Residual Networks,author mention aiming sota result however ense...,,4.0,5.0
1,1,0,Boosted Residual Networks,paper proposes boosting based ensemble procedu...,,3.0,5.0
2,2,0,Boosted Residual Networks,paper consideration proposes set procedure inc...,,3.0,5.0
3,3,0,Boosted Residual Networks,- give detail experiment setup e g parameter t...,,,
4,4,1,Deep Learning with Dynamic Computation Graphs,paper describes novel technique improve effici...,,8.0,3.0


In [None]:
x_data = data_as_given['comments']
y_data = data_as_given['accepted']
x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.2, random_state=42)

ct_vectorizer = CountVectorizer(tokenizer=LemmaTokenizer(), stop_words=ENGLISH_STOP_WORDS_MODIFIED, min_df=RARE_DOCUMENT_FREQUENCY_THRESHOLD, max_features=MAX_FEATURES, ngram_range=(1,3))
tf_vect = TfidfVectorizer(tokenizer=LemmaTokenizer(), stop_words = ENGLISH_STOP_WORDS_MODIFIED, min_df=RARE_DOCUMENT_FREQUENCY_THRESHOLD, max_features=MAX_FEATURES, ngram_range=(1,3), use_idf=False)
tfidf_vect = TfidfVectorizer(tokenizer=LemmaTokenizer(), stop_words = ENGLISH_STOP_WORDS_MODIFIED, min_df=RARE_DOCUMENT_FREQUENCY_THRESHOLD, max_features=MAX_FEATURES, ngram_range=(1,3), use_idf=True)


In [None]:
def svm_train(df, vectorizer, x_train, x_test, y_train, y_test, C, coef0, degree, gamma, kernel):
  x_train = vectorizer.fit_transform(x_train)
  x_test = vectorizer.transform(x_test)

  svm = SVC(C = C, coef0 = coef0, degree = degree, gamma = gamma, kernel=kernel)
  svm.fit(x_train, y_train)
  y_pred = svm.predict(x_test)

  accuracy = accuracy_score(y_test, y_pred)
  print("Accuracy:", accuracy)
  print(classification_report(y_test, y_pred))
  print(confusion_matrix(y_test, y_pred))

  cv_results_f1 = cross_val_score(svm, x_train, y_train, cv=5, scoring='f1')
  print("CV F1 Score:", cv_results_f1.mean())
  cv_results_acc = cross_val_score(svm, x_train, y_train, cv=5, scoring='accuracy')
  print("CV Accuracy Score:", cv_results_acc.mean())

  return svm

In [None]:
def rf_train(df, vectorizer, x_train, x_test, y_train, y_test, n_estimators, criterion, max_features, max_depth, bootstrap):
  x_train = vectorizer.fit_transform(x_train)
  x_test = vectorizer.transform(x_test)

  rf = RandomForestClassifier(class_weight='balanced', n_estimators= n_estimators, criterion= criterion, max_features= max_features, max_depth= max_depth, bootstrap= bootstrap)
  rf.fit(x_train, y_train)
  y_pred = rf.predict(x_test)

  accuracy = accuracy_score(y_test, y_pred)
  print("Accuracy:", accuracy)

  print(classification_report(y_test, y_pred))
  print(confusion_matrix(y_test, y_pred))

  cv_results_f1 = cross_val_score(rf, x_train, y_train, cv=5, scoring='f1')
  print("CV F1 Score:", cv_results_f1.mean())
  cv_results_acc = cross_val_score(rf, x_train, y_train, cv=5, scoring='accuracy')
  print("CV Accuracy Score:", cv_results_acc.mean())

  return rf

In [None]:
def lr_train(df, vectorizer, x_train, x_test, y_train, y_test, penalty, C, max_iter, solver):
  x_train = vectorizer.fit_transform(x_train)
  x_test = vectorizer.transform(x_test)

  lr = LogisticRegression(penalty = penalty, C = C, max_iter = max_iter, solver = solver)
  lr.fit(x_train, y_train)
  y_pred = lr.predict(x_test)

  accuracy = accuracy_score(y_test, y_pred)
  print("Accuracy:", accuracy)
  print(classification_report(y_test, y_pred))

  cv_results_f1 = cross_val_score(lr, x_train, y_train, cv=5, scoring='f1')
  print("CV F1 Score:", cv_results_f1.mean())
  cv_results_acc = cross_val_score(lr, x_train, y_train, cv=5, scoring='accuracy')
  print("CV Accuracy Score:", cv_results_acc.mean())

  return lr

In [None]:
def grid_search(svm, x_train, y_train):
  param_grid = {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf', 'poly', 'sigmoid'], 'gamma': ['scale', 'auto'], 'degree': [2, 3, 4, 5], 'coef0': [0.0, 0.1, 0.5, 1.0]}
  grid_search = GridSearchCV(svm, param_grid, cv=5)

  grid_search.fit(x_train, y_train)
  print("Best parameters:", grid_search.best_params_)

In [None]:
def grid_search_rf(rf, x_train, y_train):
  param_grid = {
    'n_estimators': [25, 40, 50],
    'criterion': ['gini', 'entropy', 'log_loss'],
    'max_features': ['log2', 'sqrt'],
    'max_depth': [2,4],
    'bootstrap': [True, False]
    }
  grid_search = GridSearchCV(rf, param_grid, cv=5)

  grid_search.fit(x_train, y_train)
  print("Best parameters:", grid_search.best_params_)

In [None]:
def grid_search_lr(rf, x_train, y_train):
  param_grid = {'penalty':['l2'],'C':[1, 10, 100], 'max_iter': [100, 200, 500, 1000], 'solver': ['lbfgs', 'saga', 'liblinear']}
  grid_search = GridSearchCV(rf, param_grid, cv=5)

  grid_search.fit(x_train, y_train)
  print("Best parameters:", grid_search.best_params_)

In [None]:
svm_ct = svm_train(data_as_given, ct_vectorizer, x_train, x_test, y_train, y_test, 10, 1.0, 2, 'auto', 'poly')
svm_tf = svm_train(data_as_given, tf_vect, x_train, x_test, y_train, y_test, 10, 1.0, 2, 'auto', 'poly')
svm_tfidf = svm_train(data_as_given, tfidf_vect, x_train, x_test, y_train, y_test, 10, 1.0, 2, 'auto', 'poly')



Accuracy: 0.673728813559322
              precision    recall  f1-score   support

           0       0.66      0.77      0.71       124
           1       0.69      0.57      0.62       112

    accuracy                           0.67       236
   macro avg       0.68      0.67      0.67       236
weighted avg       0.68      0.67      0.67       236

[[95 29]
 [48 64]]
CV F1 Score: 0.5157421573859929
CV Accuracy Score: 0.6351908139142182




Accuracy: 0.5254237288135594
              precision    recall  f1-score   support

           0       0.53      1.00      0.69       124
           1       0.00      0.00      0.00       112

    accuracy                           0.53       236
   macro avg       0.26      0.50      0.34       236
weighted avg       0.28      0.53      0.36       236

[[124   0]
 [112   0]]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


CV F1 Score: 0.0
CV Accuracy Score: 0.5885511651469099




Accuracy: 0.5254237288135594
              precision    recall  f1-score   support

           0       0.53      1.00      0.69       124
           1       0.00      0.00      0.00       112

    accuracy                           0.53       236
   macro avg       0.26      0.50      0.34       236
weighted avg       0.28      0.53      0.36       236

[[124   0]
 [112   0]]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


CV F1 Score: 0.0
CV Accuracy Score: 0.5885511651469099


In [None]:
x_train_ct = ct_vectorizer.fit_transform(x_train)
x_train_tf = tf_vect.fit_transform(x_train)
x_train_idf = tfidf_vect.fit_transform(x_train)



In [None]:
grid_search(svm_ct, x_train_ct, y_train)
grid_search(svm_tf, x_train_tf, y_train)
grid_search(svm_tfidf, x_train_idf, y_train)

Best parameters: {'C': 10, 'coef0': 1.0, 'degree': 2, 'gamma': 'auto', 'kernel': 'poly'}
Best parameters: {'C': 1, 'coef0': 1.0, 'degree': 3, 'gamma': 'scale', 'kernel': 'poly'}
Best parameters: {'C': 10, 'coef0': 0.0, 'degree': 2, 'gamma': 'scale', 'kernel': 'poly'}


In [None]:
print("ct")
svm_ct_tuned = svm_train(data_as_given, ct_vectorizer, x_train, x_test, y_train, y_test, 10, 1.0, 2, 'auto', 'poly')
print("tf")
svm_tf_tuned = svm_train(data_as_given, tf_vect, x_train, x_test, y_train, y_test, 1, 1.0, 3, 'scale', 'poly')
print("tfidf")
svm_tfidf_tuned = svm_train(data_as_given, tfidf_vect, x_train, x_test, y_train, y_test, 10, 0.0, 2, 'scale', 'poly')

ct




Accuracy: 0.673728813559322
              precision    recall  f1-score   support

           0       0.66      0.77      0.71       124
           1       0.69      0.57      0.62       112

    accuracy                           0.67       236
   macro avg       0.68      0.67      0.67       236
weighted avg       0.68      0.67      0.67       236

[[95 29]
 [48 64]]
CV F1 Score: 0.5157421573859929
CV Accuracy Score: 0.6351908139142182
tf




Accuracy: 0.652542372881356
              precision    recall  f1-score   support

           0       0.65      0.75      0.69       124
           1       0.66      0.54      0.60       112

    accuracy                           0.65       236
   macro avg       0.65      0.65      0.65       236
weighted avg       0.65      0.65      0.65       236

[[93 31]
 [51 61]]
CV F1 Score: 0.5394243445295472
CV Accuracy Score: 0.6521501744906
tfidf




Accuracy: 0.6779661016949152
              precision    recall  f1-score   support

           0       0.64      0.90      0.74       124
           1       0.79      0.44      0.56       112

    accuracy                           0.68       236
   macro avg       0.71      0.67      0.65       236
weighted avg       0.71      0.68      0.66       236

[[111  13]
 [ 63  49]]
CV F1 Score: 0.4571811637295301
CV Accuracy Score: 0.6691489361702129


In [None]:
rf_ct = rf_train(data_as_given, ct_vectorizer, x_train, x_test, y_train, y_test, 25, 'gini', 'sqrt', 2, False)
rf_tf = rf_train(data_as_given, tf_vect, x_train, x_test, y_train, y_test, 25, 'gini', 'sqrt', 2, False)
rf_tfidf = rf_train(data_as_given, tfidf_vect, x_train, x_test, y_train, y_test, 25, 'gini', 'sqrt', 2, False)

grid_search_rf(rf_ct, x_train_ct, y_train)
grid_search_rf(rf_tf, x_train_tf, y_train)
grid_search_rf(rf_tfidf, x_train_idf, y_train)

Accuracy: 0.6228813559322034
              precision    recall  f1-score   support

           0       0.67      0.56      0.61       124
           1       0.59      0.69      0.63       112

    accuracy                           0.62       236
   macro avg       0.63      0.63      0.62       236
weighted avg       0.63      0.62      0.62       236

[[70 54]
 [35 77]]
CV F1 Score: 0.5206743031943518
CV Accuracy Score: 0.5779635258358663




Accuracy: 0.597457627118644
              precision    recall  f1-score   support

           0       0.60      0.73      0.65       124
           1       0.60      0.46      0.52       112

    accuracy                           0.60       236
   macro avg       0.60      0.59      0.59       236
weighted avg       0.60      0.60      0.59       236

[[90 34]
 [61 51]]
CV F1 Score: 0.5334737659194586
CV Accuracy Score: 0.5716030620285939




Accuracy: 0.559322033898305
              precision    recall  f1-score   support

           0       0.58      0.61      0.59       124
           1       0.54      0.50      0.52       112

    accuracy                           0.56       236
   macro avg       0.56      0.56      0.56       236
weighted avg       0.56      0.56      0.56       236

[[76 48]
 [56 56]]
CV F1 Score: 0.49018464697393
CV Accuracy Score: 0.5641224811437577
Best parameters: {'bootstrap': False, 'criterion': 'gini', 'max_depth': 4, 'max_features': 'log2', 'n_estimators': 40}
Best parameters: {'bootstrap': True, 'criterion': 'gini', 'max_depth': 2, 'max_features': 'sqrt', 'n_estimators': 40}
Best parameters: {'bootstrap': False, 'criterion': 'log_loss', 'max_depth': 4, 'max_features': 'sqrt', 'n_estimators': 50}


In [None]:
print("ct")
rf_ct = rf_train(data_as_given, ct_vectorizer, x_train, x_test, y_train, y_test, 40, 'gini', 'log2', 4, False)
print("tf")
rf_tf = rf_train(data_as_given, tf_vect, x_train, x_test, y_train, y_test, 40, 'gini', 'sqrt', 2, True)
print("tfidf")
rf_tfidf = rf_train(data_as_given, tfidf_vect, x_train, x_test, y_train, y_test, 50, 'log_loss', 'sqrt', 4, True)

ct




Accuracy: 0.6440677966101694
              precision    recall  f1-score   support

           0       0.69      0.60      0.64       124
           1       0.61      0.70      0.65       112

    accuracy                           0.64       236
   macro avg       0.65      0.65      0.64       236
weighted avg       0.65      0.64      0.64       236

[[74 50]
 [34 78]]
CV F1 Score: 0.5244967135332829
CV Accuracy Score: 0.5895474501857481
tf




Accuracy: 0.5720338983050848
              precision    recall  f1-score   support

           0       0.58      0.68      0.62       124
           1       0.56      0.46      0.50       112

    accuracy                           0.57       236
   macro avg       0.57      0.57      0.56       236
weighted avg       0.57      0.57      0.57       236

[[84 40]
 [61 51]]
CV F1 Score: 0.4847499913436065
CV Accuracy Score: 0.5799898682877406
tfidf




Accuracy: 0.6398305084745762
              precision    recall  f1-score   support

           0       0.64      0.70      0.67       124
           1       0.63      0.57      0.60       112

    accuracy                           0.64       236
   macro avg       0.64      0.64      0.64       236
weighted avg       0.64      0.64      0.64       236

[[87 37]
 [48 64]]
CV F1 Score: 0.5032652283074774
CV Accuracy Score: 0.6171788810086684


In [None]:
lr_ct = lr_train(data_as_given, ct_vectorizer, x_train, x_test, y_train, y_test, None, 10, 100, 'saga')
lr_tf = lr_train(data_as_given, tf_vect, x_train, x_test, y_train, y_test, None, 10, 100, 'saga')
lr_tfidf = lr_train(data_as_given, tfidf_vect, x_train, x_test, y_train, y_test, None, 10, 100, 'saga')

grid_search_lr(lr_ct, x_train_ct, y_train)
grid_search_lr(lr_tf, x_train_tf, y_train)
grid_search_lr(lr_tfidf, x_train_idf, y_train)



Accuracy: 0.635593220338983
              precision    recall  f1-score   support

           0       0.63      0.73      0.68       124
           1       0.64      0.54      0.58       112

    accuracy                           0.64       236
   macro avg       0.64      0.63      0.63       236
weighted avg       0.64      0.64      0.63       236





CV F1 Score: 0.5403566724619356




CV Accuracy Score: 0.6489643138579309




Accuracy: 0.6483050847457628
              precision    recall  f1-score   support

           0       0.65      0.70      0.68       124
           1       0.64      0.59      0.61       112

    accuracy                           0.65       236
   macro avg       0.65      0.65      0.65       236
weighted avg       0.65      0.65      0.65       236





CV F1 Score: 0.5342545622824171




CV Accuracy Score: 0.6362490149724193




Accuracy: 0.6779661016949152
              precision    recall  f1-score   support

           0       0.68      0.74      0.71       124
           1       0.68      0.61      0.64       112

    accuracy                           0.68       236
   macro avg       0.68      0.67      0.67       236
weighted avg       0.68      0.68      0.68       236





CV F1 Score: 0.5404673917432106




CV Accuracy Score: 0.6447258808960937




Best parameters: {'C': 1, 'max_iter': 1000, 'penalty': 'l2', 'solver': 'saga'}




Best parameters: {'C': 10, 'max_iter': 200, 'penalty': 'l2', 'solver': 'saga'}




Best parameters: {'C': 10, 'max_iter': 100, 'penalty': 'l2', 'solver': 'saga'}


In [None]:
print("ct")
lr_ct = lr_train(data_as_given, ct_vectorizer, x_train, x_test, y_train, y_test, 'l2', 1, 1000, 'saga')
print("tf")
lr_tf = lr_train(data_as_given, ct_vectorizer, x_train, x_test, y_train, y_test, 'l2', 10, 200, 'saga')
print("tfidf")
lr_tfidf = lr_train(data_as_given, ct_vectorizer, x_train, x_test, y_train, y_test, 'l2', 10, 100, 'saga')

ct




Accuracy: 0.6398305084745762
              precision    recall  f1-score   support

           0       0.65      0.69      0.67       124
           1       0.63      0.58      0.60       112

    accuracy                           0.64       236
   macro avg       0.64      0.64      0.64       236
weighted avg       0.64      0.64      0.64       236





CV F1 Score: 0.5589471404503066




CV Accuracy Score: 0.652167060677699
tf




Accuracy: 0.6440677966101694
              precision    recall  f1-score   support

           0       0.64      0.72      0.68       124
           1       0.64      0.56      0.60       112

    accuracy                           0.64       236
   macro avg       0.64      0.64      0.64       236
weighted avg       0.64      0.64      0.64       236





CV F1 Score: 0.5504110954079418




CV Accuracy Score: 0.6479004840706969
tfidf




Accuracy: 0.6313559322033898
              precision    recall  f1-score   support

           0       0.63      0.72      0.67       124
           1       0.63      0.54      0.58       112

    accuracy                           0.63       236
   macro avg       0.63      0.63      0.63       236
weighted avg       0.63      0.63      0.63       236





CV F1 Score: 0.5432860050932179




CV Accuracy Score: 0.6489643138579309


