In [1]:
from textblob import TextBlob
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics
from sklearn.svm import SVC 
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier ,GradientBoostingClassifier
from lightgbm import LGBMClassifier
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import decomposition, ensemble
import tensorflow 
import pandas, xgboost, numpy, textblob, string
from keras.preprocessing import text, sequence
from keras import layers, models, optimizers
import pandas as pd
import numpy as np

Test-Train

In [2]:
df = pd.read_csv("org_opmin.csv",usecols=["Text","Label"])

In [3]:
train_x, test_x, train_y, test_y = model_selection.train_test_split(df["Text"],df["Label"], random_state= 42)
encoder = preprocessing.LabelEncoder()
train_y = encoder.fit_transform(train_y)
test_y = encoder.fit_transform(test_y)

Count Vectors

In [4]:
vectorizer = CountVectorizer()
vectorizer.fit(train_x)
x_train_count = vectorizer.transform(train_x)
x_test_count = vectorizer.transform(test_x) #This process creates a vector for each word and applies it to each line.

TF-IDF

In [7]:
tf_idf_word_vec = TfidfVectorizer()
tf_idf_word_vec.fit(train_x)
x_train_tf_idf = tf_idf_word_vec.transform(train_x)
x_test_tf_idf = tf_idf_word_vec.transform(test_x) #Same operation as CountVectors, but more relative.There's no difference.

Logistic Regression

In [5]:
loj = linear_model.LogisticRegression(solver="liblinear")
loj_params = {"penalty":["l1","l2"],
              "tol":[0.0001,0.001,0.01,0.1,0.00001],
              "C":[1,2,3,4,5,0.5,0.1],
              "intercept_scaling":[1,2,3,4,5,0.5,0.1],
              "max_iter": [100,200,500,50,25],}
loj_cv_model = GridSearchCV(loj,loj_params,cv=10,n_jobs=-1,verbose=2)
loj_cv_model.fit(x_train_count,train_y)

Fitting 10 folds for each of 2450 candidates, totalling 24500 fits


In [6]:
loj_cv_model.best_params_

{'C': 1,
 'intercept_scaling': 0.1,
 'max_iter': 100,
 'penalty': 'l2',
 'tol': 0.1}

In [49]:
loj_tuned = linear_model.LogisticRegression(solver="liblinear",C=1,intercept_scaling=0.1,max_iter=100,penalty="l2",tol=1)
loj_tuned = loj_tuned.fit(x_train_count,train_y)
accuracy = model_selection.cross_val_score(loj_tuned, x_test_count, test_y, cv = 10).mean()
print("Logistic Regression Accuracy Rate:", accuracy)

Logistic Regression Accuracy Rate: 0.7673486786018755


Naive Bayes

In [6]:
nb = naive_bayes.MultinomialNB()
nb_model = nb.fit(x_train_count,train_y)
accuracy = model_selection.cross_val_score(nb_model, x_test_count, test_y, cv=10).mean()
print("Naive Bayes Accuracy Rate:", accuracy)

Naive Bayes Accuracy Rate: 0.7321398124467178


Support Vector Machine

In [10]:
svc_model = SVC(kernel= "linear").fit(x_train_count,train_y)
svc_params = {"C": np.arange(1,10),"kernel" : ["rbf"],"gamma" :["scale"],"tol" : np.arange(0.001,0.01,0.001)}
svc_cv_model = GridSearchCV(svc_model,svc_params,cv= 10, n_jobs=-1,verbose=2)
svc_cv_model.fit(x_train_count,train_y)


Fitting 10 folds for each of 81 candidates, totalling 810 fits


In [11]:
svc_cv_model.best_params_

{'C': 1, 'gamma': 'scale', 'kernel': 'rbf', 'tol': 0.001}

In [17]:
svc_tuned = SVC( kernel= "rbf",C= 4, gamma= "scale", tol= 0.001)
svc_tuned_model = svc_tuned.fit(x_train_count,train_y)
accuracy = model_selection.cross_val_score(svc_tuned_model, x_test_count, test_y, cv=10).mean()
print("Support Vector Machine Accuracy Rate:", accuracy)

Support Vector Machine Accuracy Rate: 0.7570545609548167


Artificial Neural Networks

In [35]:
scaler = StandardScaler(with_mean = False)
x_train_scaled = scaler.fit_transform(x_train_count)
mlp_model = MLPClassifier().fit(x_train_scaled,train_y)
mlp_params = {"alpha": [0.0001,0.0002,0.0003,0.0004,0.001],
               "hidden_layer_sizes" : [(10,10),(100,100),(3,5),(5,3)],
               "solver": ["lbfgs","adam","sgd"],
               "activation":["relu","logistic"]}
mlp_cv_model = GridSearchCV(mlp_model,mlp_params,cv= 10, n_jobs=-1,verbose=2)
mlp_cv_model.fit(x_train_scaled,train_y) #I made a special parameterisation for MLP, but I could not increase the accuracy score very much.


Fitting 10 folds for each of 120 candidates, totalling 1200 fits


In [36]:
mlp_cv_model.best_params_

{'activation': 'logistic',
 'alpha': 0.0001,
 'hidden_layer_sizes': (3, 5),
 'solver': 'lbfgs'}

In [48]:
mlp_tuned_model = MLPClassifier(alpha= 0.0001, activation= "logistic", solver= "lbfgs").fit(x_train_scaled,train_y)
x_test_scaled = scaler.fit_transform(x_test_count)
accuracy = model_selection.cross_val_score(mlp_tuned_model, x_test_scaled, test_y, cv=10).mean()
print("Artificial Neural Networks Accuracy Rate:", accuracy)

Artificial Neural Networks Accuracy Rate: 0.7131074168797954


Classification and Regression Trees (CART)

In [11]:
cart_model = DecisionTreeClassifier().fit(x_train_count,train_y)
cart_params = {"max_depth": list(range(1,20)),
               "min_samples_split":list(range(2,10)),
               "min_samples_leaf": list(range(1,10)),
               "max_leaf_nodes": list(range(2,20)),}
car_cv_model = GridSearchCV(cart_model,cart_params,cv=10,n_jobs= -1, verbose= 1)
car_cv_model.fit(x_train_count,train_y)

Fitting 10 folds for each of 24624 candidates, totalling 246240 fits


In [12]:
car_cv_model.best_params_

{'max_depth': 18,
 'max_leaf_nodes': 19,
 'min_samples_leaf': 3,
 'min_samples_split': 8}

In [23]:
cart_tuned_model = DecisionTreeClassifier(max_depth= 18,max_leaf_nodes= 19,min_samples_leaf= 3,min_samples_split= 8).fit(x_train_count,train_y)
accuracy = model_selection.cross_val_score(cart_tuned_model, x_test_count, test_y, cv=10).mean()
print("Classification and Regression Trees Accuracy Rate:", accuracy)

Classification and Regression Trees Accuracy Rate: 0.6529838022165388


Random Forest

In [26]:
rf_model = RandomForestClassifier().fit(x_train_count,train_y)
rf_params = {"n_estimators" : [1000,1500,1700,1800,2000],
             "max_depth": list(range(19,40)),}
rf_cv_model = GridSearchCV(rf_model,rf_params,cv=5,n_jobs= -1, verbose= 15)
rf_cv_model.fit(x_train_count,train_y)

Fitting 5 folds for each of 105 candidates, totalling 525 fits


In [28]:
rf_cv_model.best_params_

{'max_depth': 21, 'n_estimators': 1800}

In [50]:
rf_tuned_model = RandomForestClassifier(max_depth= 21,n_estimators= 1800).fit(x_train_count,train_y)
accuracy = model_selection.cross_val_score(rf_tuned_model, x_test_count, test_y, cv=10).mean()
print("Random Forest Accuracy Rate:", accuracy)

Random Forest Accuracy Rate: 0.7466751918158568


Gradient Boosting Machines (GBM)

In [76]:
gbm_model = GradientBoostingClassifier()
gbm_params= {"learning_rate": [0.1,0.2,0.4,0.6,0.8,1],
             "max_depth": [3,5,8,25,50],
             "n_estimators" : [100,200,300],
             "min_samples_split": list(range(2,10)),}
gbm_cv_model = GridSearchCV(gbm_model,gbm_params,cv=5,n_jobs= -1, verbose= 2) #I don't know why but gbm is very slow.
gbm_cv_model.fit(x_train_count,train_y)

Fitting 5 folds for each of 720 candidates, totalling 3600 fits


KeyboardInterrupt: 

In [None]:
gbm_cv_model.best_params_

In [82]:
gbm_tuned_model = GradientBoostingClassifier(n_estimators= 500).fit(x_train_count,train_y)
accuracy = model_selection.cross_val_score(gbm_tuned_model, x_test_count, test_y, cv=10).mean()
print("GBM Accuracy Rate:", accuracy)

GBM Accuracy Rate: 0.7378729752770673


XGBoost

In [61]:
xgb_model = xgboost.XGBClassifier()
xgb_params = {"n_estimators" : [100,200,400,500],
              "subsample": [0.6,0.8,1],
              "max_depth": [15,20,25,35,50],
              "learning_rate": [0.1,0.2,1,0.05,0.02]}
xgb_cv_model = GridSearchCV(xgb_model,xgb_params,cv=5,n_jobs= -1, verbose= 2)
xgb_cv_model.fit(x_train_count,train_y)

Fitting 5 folds for each of 300 candidates, totalling 1500 fits


In [62]:
xgb_cv_model.best_params_

{'learning_rate': 0.05, 'max_depth': 35, 'n_estimators': 100, 'subsample': 1}

In [73]:
xgb_tuned_model = xgboost.XGBClassifier(learning_rate = 0.05, max_depth = 35,n_estimators= 100, subsample =1).fit(x_train_count,train_y)
accuracy = model_selection.cross_val_score(xgb_tuned_model, x_test_count, test_y, cv=10).mean()
print("XGBoost Accuracy Rate:", accuracy)

XGBoost Accuracy Rate: 0.7129795396419437


Conclusion : I'll use Logistic Regression in this project :(