In [27]:

import numpy as np
import pandas as pd
from imblearn.pipeline import Pipeline as imPipeline
from imblearn.under_sampling import RandomUnderSampler
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics import classification_report, f1_score, roc_auc_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from spacy.lang.en.stop_words import STOP_WORDS


In [48]:
# from google.colab import drive
# drive.mount('/content/drive')
# For Google colab only

df = pd.read_csv("../new_clean_sm.csv")
df = df.dropna()


Split data

In [49]:
X = df.reviewText.values
y = df.overall.values

In [50]:
num = len(df)
X, y  = X[:num], y[:num]
le = LabelEncoder()
y = le.fit_transform(y)

In [51]:
# vectoriser = TfidfVectorizer(analyzer='word', max_features=5000)
# X = vectoriser.fit_transform(X)
# rus = RandomUnderSampler()
# X_rus, y_rus = rus.fit_sample(X, y)

In [52]:
#X_train, X_test, y_train, y_test = train_test_split(X_rus, y_rus, test_size=0.33, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [53]:

def train_predict(pipline):
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)
    score = classification_report(y_test, y_pred)
    #score = roc_auc_score(y_test, y_pred, average="macro")
    print(score)


Embeddings Converter


In [59]:
def make_balance_pipe(steps):
    
    
    x = list(range(len(steps)))
    pipeline= [(str(x),y) for x,y in zip(x,steps)]

    return imPipeline(pipeline)


Naive Bayes


In [60]:
naive_bayes = Pipeline([
    ('classifier', MultinomialNB()),
])

pipeline = Pipeline([
    ('vectoriser', tfidf_vectoriser),
    ('classifier', naive_bayes)
])

# pipeline = make_balance_pipe([TfidfVectorizer(),
#                               RandomUnderSampler(),
#                               MultinomialNB()])


In [None]:
train_predict(pipeline)

  'stop_words.' % sorted(inconsistent))


Max Entropy

In [23]:
from sklearn.linear_model import LogisticRegression, SGDClassifier

In [24]:

max_ent = Pipeline([
    ('classifier', LogisticRegression(penalty='l2', C=1.0)),
])

pipeline = Pipeline([
    ('vectoriser', tfidf_vectoriser),
    ('classifier', max_ent)
])

In [25]:
train_predict(pipeline)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


0.20957725946613334


Logistic Regression

In [17]:
params={"C":np.logspace(-3,3,7), "penalty":["l1","l2"]}
logistic_regression = Pipeline([
    ('classifier', GridSearchCV(LogisticRegression(), params, cv=10)),
])

pipeline = Pipeline([
    ('vectoriser', tfidf_vectoriser),
    ('classifier', logistic_regression)
])

In [18]:
train_predict(pipeline)




ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.



KeyboardInterrupt: 

### Bagging Models

Random Forest

In [None]:
from sklearn.model_selection import RandomizedSearchCV# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

In [None]:
random_forest = Pipeline([
    ('classifier', RandomizedSearchCV(estimator = RandomForestClassifier(), param_distributions = random_grid, 
                                      n_iter = 100, cv = 3, verbose=2, random_state=42, 
                                      n_jobs = -1)),
])

random_forest = Pipeline([
    ('classifier', RandomForestClassifier()),
])

pipeline = Pipeline([
    ('vectoriser', tfidf_vectoriser),
    ('classifier', random_forest)
])

In [None]:
train_predict(pipeline)

### Boosting Models

XGBoost


In [1]:
# !wget https://s3-us-west-2.amazonaws.com/xgboost-wheels/xgboost-0.81-py2.py3-none-manylinux1_x86_64.whl
# !pip uninstall xgboost --yes
# !pip install xgboost-0.81-py2.py3-none-manylinux1_x86_64.whl

# For Google Colab Only

In [2]:
from xgboost import XGBClassifier

In [None]:
xgb_params = {'eta': 0.3,
              'max_depth': 5,
              'subsample': 0.8,
              'colsample_bytree': 0.8,
              'objective': 'binary:logistic',
              'eval_metric': 'auc',
              'seed': 23,
               'tree_method':"gpu_hist"
             }
params = {
    'max_depth': 3,
    'objective': 'multi:softmax',  # error evaluation for multiclass training
    'num_class': 10,
    'n_gpus': 1
}

xg_boost = Pipeline([
    ('classifier', XGBClassifier(**params))
])

pipeline = Pipeline([
    ('vectoriser', tfidf_vectoriser),
    ('classifier', xg_boost)
])


In [None]:
train_predict(pipeline)

AdaBoost


In [36]:
from sklearn.ensemble import AdaBoostClassifier


In [None]:

adaBoost = Pipeline([
    ('classifier', AdaBoostClassifier()),
])

pipeline = Pipeline([
    ('vectoriser', tfidf_vectoriser),
    ('classifier', adaBoost)
])


In [None]:
train_predict(pipeline)


Voting classifier

In [27]:
from sklearn.ensemble import VotingClassifier



In [41]:
models = [
    MultinomialNB(),
    BernoulliNB(),
    LogisticRegression(),
    SGDClassifier(),
    RandomForestClassifier(),
    AdaBoostClassifier()
]

m_names = [m.__class__.__name__ for m in models]

models = list(zip(m_names, models))
vc = VotingClassifier(estimators=models)

### Voting Classifier

In [None]:



votingClassifier = Pipeline([
    ('classifier', vc),
])

pipeline = Pipeline([
    ('vectoriser', tfidf_vectoriser),
    ('classifier', votingClassifier)
])




In [None]:
train_predict(pipeline)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [None]:
pipeline = make_balance_pipe([TfidfVectorizer(),RandomUnderSampler(),vc])
train_predict(pipeline)