In [9]:
from textblob import TextBlob
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics
from sklearn.svm import SVC 
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import decomposition, ensemble
import tensorflow 
import pandas, xgboost, numpy, textblob, string
from keras.preprocessing import text, sequence
from keras import layers, models, optimizers
import pandas as pd
import numpy as np

Test-Train

In [2]:
df = pd.read_csv("org_opmin.csv",usecols=["Text","Label"])

In [3]:
train_x, test_x, train_y, test_y = model_selection.train_test_split(df["Text"],df["Label"], random_state= 42)
encoder = preprocessing.LabelEncoder()
train_y = encoder.fit_transform(train_y)
test_y = encoder.fit_transform(test_y)

Count Vectors

In [4]:
vectorizer = CountVectorizer()
vectorizer.fit(train_x)
x_train_count = vectorizer.transform(train_x)
x_test_count = vectorizer.transform(test_x) #This process creates a vector for each word and applies it to each line.

TF-IDF

In [7]:
tf_idf_word_vec = TfidfVectorizer()
tf_idf_word_vec.fit(train_x)
x_train_tf_idf = tf_idf_word_vec.transform(train_x)
x_test_tf_idf = tf_idf_word_vec.transform(test_x) #Same operation as CountVectors, but more relative.There's no difference.

Logistic Regression

In [5]:
loj = linear_model.LogisticRegression(solver="liblinear")
loj_model = loj.fit(x_train_count,train_y)
accuracy = model_selection.cross_val_score(loj_model, x_test_count, test_y, cv = 10).mean()
print("Logistic Regression Accuracy Rate:", accuracy)

Logistic Regression Accuracy Rate: 0.7585464620630861


Naive Bayes

In [6]:
nb = naive_bayes.MultinomialNB()
nb_model = nb.fit(x_train_count,train_y)
accuracy = model_selection.cross_val_score(nb_model, x_test_count, test_y, cv=10).mean()
print("Naive Bayes Accuracy Rate:", accuracy)

Naive Bayes Accuracy Rate: 0.7321398124467178


Support Vector Machine

In [10]:
svc_model = SVC(kernel= "linear").fit(x_train_count,train_y)
svc_params = {"C": np.arange(1,10),"kernel" : ["rbf"],"gamma" :["scale"],"tol" : np.arange(0.001,0.01,0.001)}
svc_cv_model = GridSearchCV(svc_model,svc_params,cv= 10, n_jobs=-1,verbose=2)
svc_cv_model.fit(x_train_count,train_y)


Fitting 10 folds for each of 81 candidates, totalling 810 fits


In [11]:
svc_cv_model.best_params_

{'C': 1, 'gamma': 'scale', 'kernel': 'rbf', 'tol': 0.001}

In [17]:
svc_tuned = SVC( kernel= "rbf",C= 4, gamma= "scale", tol= 0.001)
svc_tuned_model = svc_tuned.fit(x_train_count,train_y)
accuracy = model_selection.cross_val_score(svc_tuned_model, x_test_count, test_y, cv=10).mean()
print("Support Vector Machine Accuracy Rate:", accuracy)

Support Vector Machine Accuracy Rate: 0.7570545609548167


Artificial Neural Networks

In [35]:
scaler = StandardScaler(with_mean = False)
x_train_scaled = scaler.fit_transform(x_train_count)
mlp_model = MLPClassifier().fit(x_train_scaled,train_y)
mlp_params = {"alpha": [0.0001,0.0002,0.0003,0.0004,0.001],
               "hidden_layer_sizes" : [(10,10),(100,100),(3,5),(5,3)],
               "solver": ["lbfgs","adam","sgd"],
               "activation":["relu","logistic"]}
mlp_cv_model = GridSearchCV(mlp_model,mlp_params,cv= 10, n_jobs=-1,verbose=2)
mlp_cv_model.fit(x_train_scaled,train_y) #I made a special parameterisation for MLP, but I could not increase the accuracy score very much.


Fitting 10 folds for each of 120 candidates, totalling 1200 fits


In [36]:
mlp_cv_model.best_params_

{'activation': 'logistic',
 'alpha': 0.0001,
 'hidden_layer_sizes': (3, 5),
 'solver': 'lbfgs'}

In [48]:
mlp_tuned_model = MLPClassifier(alpha= 0.0001, activation= "logistic", solver= "lbfgs").fit(x_train_scaled,train_y)
x_test_scaled = scaler.fit_transform(x_test_count)
accuracy = model_selection.cross_val_score(mlp_tuned_model, x_test_scaled, test_y, cv=10).mean()
print("Artificial Neural Networks Accuracy Rate:", accuracy)

Artificial Neural Networks Accuracy Rate: 0.7131074168797954
