In [None]:
import pandas as pd
# Load the line-delimited JSON dataset into a DataFrame.
# `lines` is a boolean switch: the string "True" used previously only worked
# because any non-empty string is truthy (so "False" would have enabled it too).
df=pd.read_json(r'H:\Uni Docs\Third Semester\Data Science Lab\Code\Dataset\Final_Chai\Final All\migrantPhase3Final_ALL.json', lines=True, orient="records", encoding="utf8")
#df['label'].value_counts()

In [None]:
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.linear_model import Perceptron, SGDClassifier
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier, VotingClassifier
from sklearn.metrics import accuracy_score
from sklearn.cross_validation import cross_val_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import Counter, defaultdict
from tabulate import tabulate
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cross_validation import StratifiedShuffleSplit
from sklearn.metrics import classification_report
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score

In [None]:
#Tokenisation
import re,nltk
from nltk.tokenize import word_tokenize

# Strip "RT @user" prefixes, http... links, long dashes, the listed punctuation
# and finally every character that is not an ASCII letter, digit or space.
noise_pattern = re.compile(r'((RT @[^ ]+)|(http[^ ]*)|([—–])|([!”#$%&\’\'\€\✡"\‘()*+\“,-\.\/:;<=>?@[\]^_`{|}~…])|(([^A-Za-z0-9 ])))', flags=re.IGNORECASE)
df['text'] = df['text'].apply(lambda txt: noise_pattern.sub('', txt))

# Replace every run of digits with the fixed placeholder token 'c2a0f1s9'.
digit_run = re.compile(r'([0-9]+)', flags=re.IGNORECASE)
df['cleanedtext'] = df['text'].apply(lambda txt: digit_run.sub('c2a0f1s9', txt))

# Lower-case the cleaned text and split it into tokens with NLTK.
df['tokenised'] = df['cleanedtext'].apply(lambda txt: nltk.word_tokenize(txt.lower()))

#len(df['tokenised'])
#df['tokenised'].value_counts()

In [None]:
#Stopwords Removal
from nltk.corpus import stopwords
# NLTK's English stop-word list, extended with three contractions spelled
# without apostrophes (the cleaning step above strips punctuation).
stop = stopwords.words('english')
stop.extend(['theyre', 'thats', 'its'])
# Use a set for the membership test so each token lookup does not scan the list.
stop_lookup = set(stop)
df['stopwords']=df['tokenised'].apply(lambda tokens: [tok for tok in tokens if tok not in stop_lookup])

In [None]:
#Stemming
from nltk.stem import PorterStemmer
ps = PorterStemmer()
# Stem the stop-word-filtered tokens. Previously df['tokenised'] was stemmed,
# which left the df['stopwords'] column unused and let stop words flow into
# the features. A column-wise apply replaces the slower row-wise df.apply.
df['stemmed']=df['stopwords'].apply(lambda tokens: [ps.stem(word) for word in tokens])

In [None]:
# Model inputs: the stemmed token lists are the features, 'label' is the target.
X, y = df['stemmed'], df['label']

# Feature Extraction

In [None]:
#Building a Word2Vec model
import gensim
# X is a sequence of tokenized texts, i.e. one list of tokens per document.
model = gensim.models.Word2Vec(
    X,
    size=100,
    window=5,
    min_count=1,
    workers=2,
)
# Dictionary from each vocabulary word to its corresponding vector.
w2v = dict(zip(model.wv.index2word, model.wv.syn0))

In [None]:
#Method For Word2Vec Model that uses mean as aggregation function
class MeanAggrFunc(object):
    """Represent each token list by the mean of its tokens' word vectors.

    Provides ``fit``/``transform`` so an instance can be used as a Pipeline step.

    Parameters
    ----------
    word2vec : mapping
        Token -> vector lookup. ``dim`` is taken from the length of the first
        entry, or 0 when the mapping is empty.
    """

    def __init__(self, word2vec):
        self.word2vec = word2vec
        if len(word2vec)>0:
            # Look the first key up in the mapping that was passed in; the
            # previous code iterated the global ``w2v`` here, which fails (or
            # yields the wrong size) whenever a different mapping is supplied.
            self.dim=len(word2vec[next(iter(word2vec))])
        else:
            self.dim=0

    def fit(self, X, y=None):
        """Nothing is learned; return ``self`` so calls can be chained."""
        return self

    def transform(self, X):
        """Return an array with one averaged vector per token list in ``X``.

        Tokens missing from ``word2vec`` are skipped; a list with no known
        token maps to a zero vector of length ``dim``.
        """
        return np.array([
            np.mean([self.word2vec[w] for w in words if w in self.word2vec]
                    or [np.zeros(self.dim)], axis=0)
            for words in X
        ])

In [None]:
#Method for TFIDF Word2Vec Model
class Tfidfw2vFunc(object):
    """Represent each token list by the mean of its IDF-scaled word vectors.

    ``fit`` learns one IDF weight per token with a TfidfVectorizer; tokens not
    seen during ``fit`` fall back to the largest learned IDF.

    Parameters
    ----------
    word2vec : mapping
        Token -> vector lookup. ``dim`` is taken from the length of the first
        entry, or 0 when the mapping is empty.
    """

    def __init__(self, word2vec):
        self.word2vec = word2vec
        # Filled in by fit(); maps token -> IDF weight.
        self.word2weight = None
        if len(word2vec)>0:
            # Look the first key up in the mapping that was passed in; the
            # previous code iterated the global ``w2v`` here, which fails (or
            # yields the wrong size) whenever a different mapping is supplied.
            self.dim=len(word2vec[next(iter(word2vec))])
        else:
            self.dim=0

    def fit(self, X, y=None):
        """Learn per-token IDF weights from the token lists in ``X``; return ``self``."""
        # Identity analyzer: each document is passed through unchanged.
        tfidf = TfidfVectorizer(analyzer=lambda x: x)
        tfidf.fit(X)
        # if a word was never seen - it must be at least as infrequent
        # as any of the known words - so the default idf is the max of
        # known idf's
        max_idf = max(tfidf.idf_)
        self.word2weight = defaultdict(
            lambda: max_idf,
            [(w, tfidf.idf_[i]) for w, i in tfidf.vocabulary_.items()])

        return self

    def transform(self, X):
        """Return an array with one IDF-weighted averaged vector per token list.

        Tokens missing from ``word2vec`` are skipped; a list with no known
        token maps to a zero vector of length ``dim``.
        """
        return np.array([
                np.mean([self.word2vec[w] * self.word2weight[w]
                         for w in words if w in self.word2vec] or
                        [np.zeros(self.dim)], axis=0)
                for words in X
            ])

In [None]:
#Classifiers with Tf-idf features
def _tfidf_step():
    """Return a fresh ("tfidf_vectorizer", TfidfVectorizer) pipeline step."""
    # The identity analyzer hands the pre-tokenized lists straight through.
    # NOTE(review): scikit-learn documents `stop_words` and `ngram_range` as
    # applying only when `analyzer` is not a callable - confirm whether these
    # two arguments have any effect here.
    return ("tfidf_vectorizer",
            TfidfVectorizer(ngram_range=(1, 2), stop_words='english',
                            max_df=0.50, min_df=1, analyzer=lambda x: x))

svc_tfidf = Pipeline([_tfidf_step(), ("svc_tfidf", SVC(kernel="linear"))])
percept_tfidf = Pipeline([_tfidf_step(), ("Percept_tfidf", Perceptron(max_iter=5))])
etree_tfidf = Pipeline([_tfidf_step(), ("etree_tfidf", ExtraTreesClassifier(n_estimators=200))])
rand_tfidf = Pipeline([_tfidf_step(), ("rand_tfidf", RandomForestClassifier())])

In [None]:
#Classifiers with Word2Vec features
def _w2v_pipeline(step_name, estimator):
    """Chain the mean-of-word-vectors step with `estimator` under `step_name`."""
    return Pipeline([
        ("word2vec vectorizer", MeanAggrFunc(w2v)),
        (step_name, estimator),
    ])

etree_w2v = _w2v_pipeline("etree_w2v", ExtraTreesClassifier(n_estimators=200))
rand_w2v = _w2v_pipeline("rand_w2v", RandomForestClassifier())

In [None]:
#Classifiers with combination of Tf-idf and Word2Vec features
def _combo_pipeline(step_name, estimator):
    """Chain the TF-IDF-weighted word-vector step with `estimator` under `step_name`."""
    return Pipeline([
        ("word2vec vectorizer", Tfidfw2vFunc(w2v)),
        (step_name, estimator),
    ])

etree_combo = _combo_pipeline("etree_combo", ExtraTreesClassifier(n_estimators=200))
rand_combo = _combo_pipeline("rand_combo", RandomForestClassifier())

In [None]:
# (name, pipeline) pairs for every individual model, grouped by feature type.
tfidf_models = [("svc_tfidf", svc_tfidf), ("Percept_tfidf", percept_tfidf),
                ("etree_tfidf", etree_tfidf), ("rand_tfidf", rand_tfidf)]
w2v_models = [("etree_w2v", etree_w2v), ("rand_w2v", rand_w2v)]
combo_models = [("etree_combo", etree_combo), ("rand_combo", rand_combo)]

Model_Set = tfidf_models + w2v_models + combo_models

# Evaluation

In [None]:
#Evaluation of models using k-fold cross-validation
def _mean_cv_scores(metric):
    """Return [(model name, mean 10-fold score for `metric`)] for every entry of Model_Set."""
    return [(label, cross_val_score(estimator, X, y, scoring=metric, cv=10).mean())
            for label, estimator in Model_Set]

# All four metrics are computed first, then the tables are printed.
result_accuracy = _mean_cv_scores('accuracy')
result_precision = _mean_cv_scores('precision')
result_recall = _mean_cv_scores('recall')
result_f1 = _mean_cv_scores('f1')

for rows, heading in ((result_accuracy, 'Accuracy'),
                      (result_precision, 'Precision'),
                      (result_recall, 'Recall'),
                      (result_f1, 'F1_Score')):
    print (tabulate(rows, floatfmt=".4f", headers=("model", heading)))

In [None]:
#Voting Classifier and its performance using k-fold
# Hard-voting ensemble over the same (name, pipeline) pairs held in Model_Set.
classifier = VotingClassifier(estimators=list(Model_Set), voting='hard')

# Per-fold scores of the ensemble, one 10-fold run per metric.
result_accuracy, result_precision, result_recall, result_f1 = [
    cross_val_score(classifier, X, y, scoring=metric, cv=10)
    for metric in ('accuracy', 'precision', 'recall', 'f1')
]

for fold_scores in (result_accuracy, result_precision, result_recall, result_f1):
    print(fold_scores.mean())

In [None]:
#Model set with Voting Classifier
# Every individual model followed by the voting ensemble (builds a new list).
Model_Set_with_voting_clf = Model_Set + [("voting_classifier", classifier)]

In [None]:
#Evaluation of Performance of various classifiers using Stratified Shuffle Split
def PerfEval(model, X, y, n):
    """Return the mean F1 of `model` over 10 stratified shuffle splits.

    Parameters
    ----------
    model : estimator exposing ``fit(X, y)`` and ``predict(X)``
    X, y : features and labels; pandas objects or positionally indexable arrays
    n : number of samples to train on; the test fraction is ``1 - n / len(y)``

    Returns
    -------
    The mean of the per-split F1 scores.
    """
    def take(data, rows):
        # The splitter yields positions, so select by position: plain
        # data[rows] on a pandas Series is label-based and is only equivalent
        # when the index happens to be 0..len-1.
        return data.iloc[rows] if hasattr(data, "iloc") else data[rows]

    test_size = 1 - (n / float(len(y)))
    scores = []
    # StratifiedShuffleSplit is the class imported from sklearn.cross_validation,
    # which takes the labels up front and is iterated directly.
    for train, test in StratifiedShuffleSplit(y, n_iter=10, test_size=test_size):
        X_train, X_test = take(X, train), take(X, test)
        y_train, y_test = take(y, train), take(y, test)
        predicted = model.fit(X_train, y_train).predict(X_test)
        # Metrics take (y_true, y_pred). The previous call passed them reversed,
        # which swaps precision and recall when the metric is changed as below.
        scores.append(f1_score(y_test, predicted)) #For other performance metrics, Change 'f1_score' to 'accuracy_score' or 'precision_score' or 'recall_score'
    return np.mean(scores)

In [None]:
#Setting train sizes for Stratified Shuffle Split
train_sizes = [10,40,160,640,1000,1400,2500]
# One record per (model, training size) pair with the mean F1 from PerfEval.
table = []
for name, model in Model_Set_with_voting_clf:
    for n in train_sizes:
        score = PerfEval(model, X, y, n)
        table.append(dict(model=name, f1=score, train_size=n))
# NOTE(review): this rebinds `df`, which until here held the dataset loaded at
# the top of the notebook; earlier cells cannot be re-run after this one
# without reloading the data.
df = pd.DataFrame(table)
df

In [None]:
#Plotting of models trained with varied features by Stratified Shuffle Split
# Names of the models whose learning curves are drawn.
plotted_models = [
    "svc_tfidf", "Percept_tfidf", "etree_tfidf", "rand_tfidf",
    "etree_w2v", "rand_w2v",
    "etree_combo", "rand_combo",
    "voting_classifier",
]

plt.figure(figsize=(10, 6))
# Keep only the rows of the results frame that belong to the selected models.
fig = sns.pointplot(x='train_size', y='f1', hue='model',
                    data=df[df.model.isin(plotted_models)])

sns.set_context("notebook", font_scale=1)
fig.set(xlabel="Labeled Training Samples")
fig.set(ylabel="F1-Score")