# Baseline Model selection Experiment-Log
<br>

<b>
    Evaluated a variety a Linear and Non-linear Models provided by Sklearn.
    <br>
    Further Validation of each model was also done using K-Fold validation as it is best suited for a smaller sized data set.

In [23]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer

from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.externals import joblib
from xgboost import XGBClassifier

from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.metrics import mean_squared_error,r2_score
from sklearn.model_selection import KFold
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score, confusion_matrix

from sklearn.model_selection import GridSearchCV


import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

## Loading the dataset:

In [3]:
data = pd.read_csv('f_300_cleaned_org.csv')
data.fillna("",inplace = True)

data.replace({'flair': {"AskIndia": 0, "Non-Political": 1, "Scheduled": 2, "Photography": 3,"Science/Technology": 4,
            "Politics": 5,"Business/Finance": 6, "Policy/Economy": 7, "Sports": 8, "Food": 9 }}, inplace=True)

Y = data.flair
V = data.combined_features
X = data.title

print(data.shape)

(2422, 6)


<b>
Defining the various FLairs:
</b>

In [4]:
flairs = ["AskIndia", "Non-Political", "Scheduled", "Photography", "Science/Technology",
            "Politics", "Business/Finance", "Policy/Economy", "Sports", "Food"]

# Defining Blocks of Various Models:
<br>
<b>
    Each Code-Block contains pipelines which first converts the data into a count matrix followed by its tranformation into a normalized tf or tf-idf representation, this helps in document classification.
The data is then passes to the various models.
    <br>
    Then the models are fit with the training data and their accuracies are also calculted.
    </b>

## Naive Bayes:

In [None]:
def nb_classifier(X_train, X_test, y_train, y_test):
    nb = Pipeline([('vect', CountVectorizer()),('tfidf', TfidfTransformer()), ('clf', MultinomialNB()),])
    nb.fit(X_train, y_train)
    y_pred = nb.predict(X_test)
    print('accuracy %s' % accuracy_score(y_pred, y_test))
    print(classification_report(y_test, y_pred,target_names=flairs))

## Linear SVM:

In [None]:
def linear_svm(X_train, X_test, y_train, y_test):
    sgd = Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
        ('clf', SGDClassifier(loss='hinge', penalty='l2',alpha=1e-3, random_state=42, max_iter=5, tol=None)),])
    sgd.fit(X_train, y_train)
    y_pred = sgd.predict(X_test)
    print('accuracy %s' % accuracy_score(y_pred, y_test))
    print(classification_report(y_test, y_pred,target_names=flairs))

## Logestic Regression:

In [None]:
def logisticreg(X_train, X_test, y_train, y_test):
    logreg = Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
                         ('clf', LogisticRegression(n_jobs=1, C=1e5)),])
    logreg.fit(X_train, y_train)
    y_pred = logreg.predict(X_test)
    print('accuracy %s' % accuracy_score(y_pred, y_test))
    print(classification_report(y_test, y_pred,target_names=flairs))

## Randomforest:

In [None]:
def randomforest(X_train, X_test, y_train, y_test):
    ranfor = Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
                        ('clf', RandomForestClassifier(n_estimators = 1000, random_state = 42)),])
    ranfor.fit(X_train, y_train)
    y_pred = ranfor.predict(X_test)
    print('accuracy %s' % accuracy_score(y_pred, y_test))
    print(classification_report(y_test, y_pred,target_names=flairs))

## Multi-layer Perceptron classifier:

In [None]:
def mlpclassifier(X_train, X_test, y_train, y_test):  
    mlp = Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
                    ('clf', MLPClassifier(hidden_layer_sizes=(30,30,30))),])
    mlp.fit(X_train, y_train)
    y_pred = mlp.predict(X_test)
    print('accuracy %s' % accuracy_score(y_pred, y_test))
    print(classification_report(y_test, y_pred,target_names=flairs))

## XBG Classifier:

In [None]:
def xgbclassifier(X_train, X_test, y_train, y_test):  
    xgb_clf = Pipeline([('vect', CountVectorizer()),('tfidf', TfidfTransformer()),('clf',
       XGBClassifier(random_state=42, seed=2, colsample_bytree=0.6, subsample=0.7,objective='multi:softmax')),])
    xgb_clf.fit(X_train, y_train)
    y_pred = xgb_clf.predict(X_test)
    print('accuracy %s' % accuracy_score(y_pred, y_test))
    print(classification_report(y_test, y_pred,target_names=flairs))

In [None]:
def train_test(X,y):
 
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state = 42)
   
    print("Results of Naive Bayes Classifier")
    nb_classifier(X_train, X_test, y_train, y_test)
    print("Results of Linear Support Vector Machine")
    linear_svm(X_train, X_test, y_train, y_test)
    print("Results of Logistic Regression")
    logisticreg(X_train, X_test, y_train, y_test)
    print("Results of Random Forest")
    randomforest(X_train, X_test, y_train, y_test)
    print("Results of MLP Classifier")
    mlpclassifier(X_train, X_test, y_train, y_test)
    print("Results of XGB Classifier")
    xgbclassifier(X_train, X_test, y_train, y_test)

In [None]:
print("Flair Detection using Title as Feature ------------------------------")
train_test(X,Y)
print("Flair Detection using Combined Features------------------------------")
train_test(V,Y)

# Validation of Each Model:

In [9]:
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.metrics import mean_squared_error,r2_score
from sklearn.model_selection import KFold
from sklearn import metrics
import numpy as np

## Defining the piplines for each Model:

In [47]:
logreg = Pipeline([('vect', CountVectorizer()),('tfidf', TfidfTransformer()),
              ('clf', LogisticRegression(n_jobs=1, C=1e5)),])

sgd = Pipeline([('vect', CountVectorizer()),('tfidf', TfidfTransformer()),('clf', 
              SGDClassifier(loss='hinge', penalty='l2',alpha=1e-3, random_state=42, max_iter=5, tol=None)),])

ranfor = Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
              ('clf', RandomForestClassifier(n_estimators = 1000, random_state = 42)),])

nb = Pipeline([('vect', CountVectorizer()),('tfidf', TfidfTransformer()), ('clf', MultinomialNB()),])

xgb_clf = Pipeline([('vect', CountVectorizer()),('tfidf', TfidfTransformer()),('clf',
   XGBClassifier(random_state=42, seed=2, colsample_bytree=0.6, subsample=0.7,objective='multi:softmax')),])
  
mlp = Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
                ('clf', MLPClassifier(hidden_layer_sizes=(30,30,30))),])

## Performing K fold Cross Validation:

In [13]:
models = (logreg, sgd, nb, ranfor, xgb_clf, mlp)

count = 0 
for mod in models:
    count = count + 1
    print("--- Model : ",count,"  ---------------------------------------------------------------")
    cv_r2_scores_rf = cross_val_score(mod, V, Y, cv=10,scoring='r2')
    print(cv_r2_scores_rf)
    print("Mean 5-Fold R Squared: {}".format(np.mean(cv_r2_scores_rf)))

--- Model :  1   ---------------------------------------------------------------
[0.17294666 0.28618624 0.15386929 0.54071984 0.4335206  0.20358286
 0.43790019 0.42279468 0.45097558 0.38410931]
Mean 5-Fold R Squared: 0.34866052476698545
--- Model :  2   ---------------------------------------------------------------
[0.47879462 0.35190563 0.24398069 0.54935675 0.37255421 0.17170594
 0.29261592 0.37448455 0.36441994 0.50354251]
Mean 5-Fold R Squared: 0.3703360764576177
--- Model :  3   ---------------------------------------------------------------
[-0.53025089 -0.18699343 -0.55480335 -0.04506559 -0.06691188 -0.18197614
 -0.18231333 -0.14182989 -0.20423046 -0.17510121]
Mean 5-Fold R Squared: -0.22694761627130317
--- Model :  4   ---------------------------------------------------------------
[ 0.08852251  0.34533369  0.32951015  0.41523068  0.19829193 -0.03321715
  0.09072089  0.18728283  0.1273984   0.25404858]
Mean 5-Fold R Squared: 0.20031225166809663
--- Model :  5   ---------------

### Parameter Tuning 

In [46]:
# Create first pipeline for base without reducing features.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state = 42)

# pipe = Pipeline([('classifier' , RandomForestClassifier())])
# pipe = Pipeline([('classifier', RandomForestClassifier())])
#pipe = Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
#                       ('classifier', RandomForestClassifier()),])

# Create param grid.

pipe = Pipeline([('vect', CountVectorizer()),('tfidf', TfidfTransformer()),
    ('classifier', SGDClassifier(loss='hinge')),])



param_grid = [
    {'classifier' : [LogisticRegression()],
     'classifier__penalty' : ['l1', 'l2'],
    'classifier__C' : np.logspace(-4, 4, 20),
    'classifier__solver' : ['liblinear']},
    {'classifier' : [RandomForestClassifier()],
    'classifier__n_estimators' : list(range(10,101,10)),
    'classifier__max_features' : list(range(6,32,5))}
]

grid = {
    'classifier__alpha': [1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1e0],
    'classifier__penalty': ['l2'],
}


# Create grid search object

clf = GridSearchCV(pipe, param_grid = grid, cv = 5, verbose=True, n_jobs=-1)

# Fit on data

best_clf = clf.fit(X_train, Y_train)

y_pred = best_clf.predict(X_test)
print('accuracy %s' % accuracy_score(X_test, Y_test))
print(classification_report(Y_test, y_pred,target_names=flairs))

Fitting 5 folds for each of 8 candidates, totalling 40 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.


accuracy 0.0
                    precision    recall  f1-score   support

          AskIndia       0.40      0.37      0.39        43
     Non-Political       0.25      0.17      0.20        42
         Scheduled       0.98      0.98      0.98        51
       Photography       0.59      0.88      0.71        51
Science/Technology       0.58      0.37      0.45        59
          Politics       0.59      0.56      0.57        61
  Business/Finance       0.49      0.52      0.50        50
    Policy/Economy       0.34      0.37      0.36        43
            Sports       0.59      0.81      0.68        36
              Food       0.60      0.55      0.57        49

          accuracy                           0.56       485
         macro avg       0.54      0.56      0.54       485
      weighted avg       0.55      0.56      0.55       485



[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:    2.2s finished
