In [1]:
# importing libraries
import numpy as np
import pandas as pd # to load the datasets
# to preprocess the data
import re
from nltk.stem import PorterStemmer
ps = PorterStemmer()
from nltk.corpus import stopwords
# to analyze the data
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import sys
import nltk
import seaborn as sns
# to extract features
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
# to build the model
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB

from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

In [2]:
# Read the training and test tweet CSVs (paths are relative to the working directory)
train_data, test_data = map(pd.read_csv, ('train_E6oV3lV.csv', 'test_tweets_anuFYb8.csv'))

In [3]:
class preprocess:
    '''
        Class to pre-process the data.
        
        Functions:
        
        remove_user_handle - to remove the user handle or other patterns from the tweet text.
        
        remove_unnecessary_chars - to replace every character other than letters and '#' with a space.
        
        rm_stopwords_stem - to drop short words, optionally drop stop words, and stem what is left.
    ''' 
    def __init__(self):
        # 'and', 'or' and 'not' are kept out of the stop-word set so that they
        # are never treated as stop words.
        self.operators = {'and','or','not'}
        self.stop_words = set(stopwords.words('english'))-self.operators

    def remove_user_handle(self, raw_data, pattern):
        '''
            Remove every match of the regular expression `pattern` from `raw_data`.
            
            raw_data - a single tweet (string).
            pattern - regular expression describing the text to strip, e.g. r"@[\\w]*".
            
            Returns the tweet with all matches deleted.
        '''
        # Substitute with the pattern itself. Re-using each matched string as a
        # new regex would (a) also delete it where it is merely the prefix of a
        # longer handle and (b) misbehave when the match contains regex
        # metacharacters.
        return re.sub(pattern, '', raw_data)
    
    def remove_unnecessary_chars(self, data):
        '''
            Replace every character that is not a letter or '#' with a space.
            
            data - DataFrame with a 'clean_tweet' text column; modified in place.
            
            Returns the same DataFrame.
        '''
        # regex=True is spelled out: pandas >= 2.0 treats the pattern as a
        # literal string by default, which would leave the text untouched.
        data['clean_tweet'] = data['clean_tweet'].str.replace(r"[^a-zA-Z#]", " ", regex=True)
        return data
    
    def rm_stopwords_stem(self, data, remove_stopwords=False):
        '''
            Drop words of three characters or fewer and stem the remaining words.
            
            data - DataFrame with a 'clean_tweet' text column; modified in place.
            remove_stopwords - when True, words found in self.stop_words
                               (compared in lower case) are dropped before stemming.
                               Defaults to False, which keeps every word longer
                               than three characters.
            
            Returns the same DataFrame.
        '''
        def _clean(tweet):
            words = [w for w in tweet.split() if len(w) > 3]
            if remove_stopwords:
                words = [w for w in words if w.lower() not in self.stop_words]
            # `ps` is the module-level PorterStemmer instance.
            return ' '.join(ps.stem(w) for w in words)

        # A single element-wise apply keeps the result aligned with the frame's
        # index, whatever that index is (no positional write-back by label).
        data['clean_tweet'] = data['clean_tweet'].apply(_clean)
        return data

In [4]:
class analyzing_data:
    '''
        Class to analyze the data.
        
        Functions:
        
        understand_word_sentim - to draw a word cloud of the cleaned tweets that carry a given label.
        
        check_hashtags - to collect the hashtags found in the cleaned tweets that carry a given label.
        
        distribution_top_N_words - to plot a bar chart of the N most frequent hashtags.
    ''' 
    def understand_word_sentim(self, data, label):
        '''
            Show a word cloud built from every 'clean_tweet' whose 'label' equals `label`.
            
            data - DataFrame with 'clean_tweet' and 'label' columns.
            label - label value used to select the tweets.
        '''
        list_of_words = ' '.join(data['clean_tweet'][data['label'] == label])
        wordcloud = WordCloud(height = 700, width = 1000, random_state  = 21, max_font_size = 30).generate(list_of_words)
        plt.figure(figsize=(10,7))
        plt.imshow(wordcloud, interpolation = 'bilinear')
        plt.axis('off')
        plt.show()
        
    def check_hashtags(self, data, label):
        '''
            Collect the hashtags (without the leading '#') of the tweets with the given label.
            
            data - DataFrame with 'clean_tweet' and 'label' columns.
            label - label value used to select the tweets.
            
            Returns a flat list of hashtag strings, duplicates included.
        '''
        hashtags = []
        for tweet in data['clean_tweet'][data['label'] == label]:
            hashtags.extend(re.findall(r'#(\w+)', tweet))
        return hashtags
    
    def distribution_top_N_words(self, hashtags, N):
        '''
            Plot the counts of the N most frequent entries of `hashtags` as a bar chart.
            
            hashtags - list of hashtag strings (duplicates are what gets counted).
            N - number of most frequent hashtags to show.
            
            Returns None. Nothing is plotted when `hashtags` is empty.
        '''
        if len(hashtags) == 0:
            # An empty frequency table has nothing to rank or plot.
            print('No hashtags to plot.')
            return
        freq = nltk.FreqDist(hashtags)
        df = pd.DataFrame({'Hashtags': list(freq.keys()),
                     'count': list(freq.values())})
        df = df.nlargest(N, 'count', keep='first')
        plt.figure(figsize=(16, 5))
        ax = sns.barplot(data = df, y='count', x = 'Hashtags')
        # The label follows N instead of always claiming "Top 10".
        ax.set_xlabel('Top {} words'.format(N))
        ax.set_ylabel('count')
        plt.show()

In [5]:
class feature_extraction:
    '''
        Class to apply feature extraction.
        
        Functions:
        
        extract_using_BOW - converting text into features using Bag of words.
        
        extract_using_tfidf - converting text into features using TF-IDF.
        
        transform_using_BOW - converting new text with the already fitted Bag of words vectorizer.
        
        transform_using_tfidf - converting new text with the already fitted TF-IDF vectorizer.
    ''' 
    def __init__(self):
        # The fitted vectorizers are kept so that other data can later be
        # mapped onto the same vocabulary / feature columns.
        self.bow_vectorizer = None
        self.tfidf_vectorizer = None

    def extract_using_BOW(self, data):
        '''
            Fit a CountVectorizer on data['clean_tweet'] and return the resulting feature matrix.
            
            The fitted vectorizer is stored in self.bow_vectorizer.
        '''
        cv = CountVectorizer(max_df = 0.90, min_df = 2, max_features = 1000, stop_words='english')
        X = cv.fit_transform(data['clean_tweet'])
        self.bow_vectorizer = cv
        return X
    
    def extract_using_tfidf(self, data):
        '''
            Fit a TfidfVectorizer on data['clean_tweet'] and return the resulting feature matrix.
            
            The fitted vectorizer is stored in self.tfidf_vectorizer.
        '''
        tf = TfidfVectorizer(max_df = 0.90, min_df = 2, max_features = 1000, stop_words='english')
        X = tf.fit_transform(data['clean_tweet'])
        self.tfidf_vectorizer = tf
        return X

    def transform_using_BOW(self, data):
        '''
            Convert data['clean_tweet'] with the vectorizer fitted by extract_using_BOW.
            
            Raises RuntimeError when extract_using_BOW has not been called yet.
        '''
        if self.bow_vectorizer is None:
            raise RuntimeError('Call extract_using_BOW before transform_using_BOW.')
        return self.bow_vectorizer.transform(data['clean_tweet'])

    def transform_using_tfidf(self, data):
        '''
            Convert data['clean_tweet'] with the vectorizer fitted by extract_using_tfidf.
            
            Raises RuntimeError when extract_using_tfidf has not been called yet.
        '''
        if self.tfidf_vectorizer is None:
            raise RuntimeError('Call extract_using_tfidf before transform_using_tfidf.')
        return self.tfidf_vectorizer.transform(data['clean_tweet'])

In [6]:
def split_data(dataset, labels=None, test_size=0.20, random_state=42):
    '''
        Function to split the dataset into training and validation data
        
        dataset - feature matrix to split.
        labels - target values aligned with the rows of `dataset`; when omitted,
                 the global train_data['label'] is used.
        test_size - share of the rows held out for validation.
        random_state - seed passed to train_test_split so the split is repeatable.
        
        Returns X_train, X_test, Y_train, Y_test.
    '''
    if labels is None:
        # Backward-compatible default: fall back to the notebook-level training frame.
        labels = train_data['label']
    # Separate data into training and validation
    X_train, X_test, Y_train, Y_test = train_test_split(dataset, labels, test_size = test_size, random_state = random_state)
    return X_train, X_test, Y_train, Y_test

In [7]:
class Classification_Models:
    '''
        Class that builds and fits the candidate classifiers.
        
        Every method takes the training features and labels and returns the fitted estimator.
        
        Functions:
        
        LRModel - logistic regression (liblinear solver).
        
        SVMModel - support vector machine with a linear kernel and probability estimates.
        
        DTModel - decision tree using the entropy criterion.
        
        RFModel - random forest of 1000 entropy trees.
        
        NBModel - multinomial naive Bayes with alpha = 1.0.
    '''
    def LRModel(self, X_train, Y_train):
        '''Fit and return a LogisticRegression classifier.'''
        return LogisticRegression(solver='liblinear', random_state = 0).fit(X_train, Y_train)
    
    def SVMModel(self, X_train, Y_train):
        '''Fit and return a linear-kernel SVC with probability estimates enabled.'''
        # The RBF kernel is covered by the grid-search candidates rather than a separate builder.
        svm = SVC(kernel = 'linear', C = 1, gamma = 'auto', probability = True, random_state = 0)
        return svm.fit(X_train, Y_train)
    
    def DTModel(self, X_train, Y_train):
        '''Fit and return an entropy-based DecisionTreeClassifier.'''
        # The gini criterion is covered by the grid-search candidates rather than a separate builder.
        return DecisionTreeClassifier(criterion = 'entropy', random_state = 0).fit(X_train, Y_train)
    
    def RFModel(self, X_train, Y_train):
        '''Fit and return a RandomForestClassifier with 1000 entropy trees.'''
        forest = RandomForestClassifier(n_estimators = 1000, criterion = 'entropy', random_state = 0)
        return forest.fit(X_train, Y_train)
    
    def NBModel(self, X_train, Y_train):
        '''Fit and return a MultinomialNB classifier.'''
        return MultinomialNB(alpha = 1.0).fit(X_train, Y_train)

In [8]:
def calculate_f1score(Y_test, Y_pred, classifier):
    '''
        Print the accuracy and the F1-score of the predictions and return the F1-score.
        
        Y_test - true labels.
        Y_pred - predicted labels.
        classifier - name shown in the printed F1 line.
    '''
    f1 = f1_score(Y_test, Y_pred)
    accuracy = accuracy_score(Y_test, Y_pred)
    print('Accuracy: ', accuracy)
    print('F1 score of classifier ', classifier, ' is: ', f1)
    print('\n')
    return f1

In [9]:
def calc_predictions(classifier, X_test, threshold=0.3):
    '''
        Function to predict the outcome based on the trained model.
        
        classifier - fitted model exposing predict_proba.
        X_test - features to predict on.
        threshold - minimum probability in column 1 of predict_proba for a row
                    to be predicted as 1 (default 0.3).
        
        Returns an integer array of 0/1 predictions.
    '''
    # Column 1 of predict_proba is compared against the threshold.
    positive_proba = classifier.predict_proba(X_test)[:, 1]
    # Built-in int replaces the np.int alias, which was removed in NumPy 1.24.
    return (positive_proba >= threshold).astype(int)

In [13]:
def best_config(model, parameters, X_train, Y_train):
    '''
        Run a 5-fold, F1-scored grid search for `model` over `parameters`.
        
        Returns a list: [best parameters as a string, best estimator, best score].
    '''
    print('Grid Search for ', model)
    search = GridSearchCV(estimator=model, param_grid=parameters, scoring='f1', cv=5, n_jobs=-1)
    search.fit(X_train, Y_train)
    best_params_text = str(search.best_params_)
    return [best_params_text, search.best_estimator_, search.best_score_]

def candidate_param_list():
    '''
        Build the grid-search candidates.
        
        Returns a list of [model name, param_grid] pairs, in the order
        SVM, RF, DT, LR, NB.
    '''
    svm_grid = [
        {'C': [1, 10, 100, 1000], 'kernel': ['linear']},
        {'C': [1, 2, 10, 100], 'kernel': ['rbf'], 'gamma': [0.5, 0.6, 0.7, 0.8]},
    ]
    rf_grid = [{'n_estimators': [10, 100, 200, 250, 400], 'criterion': ['entropy', 'gini']}]
    dt_grid = [{'criterion': ['entropy', 'gini']}]
    lr_grid = [{'solver': ['liblinear']}]
    nb_grid = [{'alpha': [0.2, 1.0]}]

    return [
        ["SVM", svm_grid],
        ["RF", rf_grid],
        ["DT", dt_grid],
        ["LR", lr_grid],
        ["NB", nb_grid],
    ]

In [14]:
def model_comparison_grid_search(model_param_list, X_train, Y_train):
    '''
        Function for model comparison.
        This function returns the model with the greatest F1-score.
        
        model_param_list - list of [model name, param_grid] pairs; the name must be
                           one of 'LR', 'SVM', 'DT', 'RF', 'NB'.
        X_train, Y_train - training features and labels used for the grid searches.
        
        Returns (best_classifier, max_f1score). best_classifier is None when no
        candidate scores above 0.
        
        Raises ValueError for a model name that is not supported.
    '''
    cm = Classification_Models()
    # Explicit name -> builder table instead of eval() on the model name:
    # an unknown name gives a clear error, arbitrary expressions are never
    # evaluated, and only the models actually requested are built.
    builders = {
        'LR': cm.LRModel,
        'SVM': cm.SVMModel,
        'DT': cm.DTModel,
        'RF': cm.RFModel,
        'NB': cm.NBModel,
    }

    classifier_grid = []
    for model, parameters in model_param_list:
        if model not in builders:
            raise ValueError('Unknown model name {!r}; expected one of {}'.format(model, sorted(builders)))
        print(model)
        estimator = builders[model](X_train, Y_train)
        print(estimator)
        classifier_grid.append(best_config(estimator, parameters, X_train, Y_train))
        # Show only the newest result; printing the whole accumulated grid on
        # every iteration repeats all earlier entries.
        print(classifier_grid[-1])

    max_f1score = 0
    best_classifier = None
    for name, classifier, score in classifier_grid:
        if max_f1score < score:
            max_f1score = score
            best_classifier = classifier
    return best_classifier, max_f1score

In [15]:
# Grid search parameters: a list of [model name, param_grid] pairs, one per candidate model
model_param_list = candidate_param_list()

In [16]:
# Pre-process the training and test data
p = preprocess()
# Raw string so that \w reaches the regex engine without an invalid-escape warning.
handle_pattern = r"@[\w]*"
for tweets in (train_data, test_data):
    # Every step stores its result in tweets['clean_tweet'] of the same frame.
    tweets['clean_tweet'] = np.vectorize(p.remove_user_handle)(tweets['tweet'], handle_pattern)
    p.remove_unnecessary_chars(tweets)
    p.rm_stopwords_stem(tweets)

# Analyzing training data
a = analyzing_data()
# positive sentiments
try:
    print('Positive/Neutral sentiments')
    a.understand_word_sentim(train_data, 0) 
    pos_hashtags = a.check_hashtags(train_data, 0)
    a.distribution_top_N_words(pos_hashtags, 10)
except Exception as exc:
    # The plots are exploratory: report the failure and carry on, but do not
    # swallow KeyboardInterrupt / SystemExit the way a bare except would.
    print(type(exc), ": ", exc)
# negative sentiments
try:
    print('Negative sentiments')
    a.understand_word_sentim(train_data, 1) 
    neg_hashtags = a.check_hashtags(train_data, 1)
    a.distribution_top_N_words(neg_hashtags, 10)
except Exception as exc:
    print(type(exc), ": ", exc)

# Using Bag of Words
# Bag of Words (Count Vectorizer)
fe = feature_extraction()
X_BOW = fe.extract_using_BOW(train_data)

# Tfidf Vectorizer
X_TF = fe.extract_using_tfidf(train_data)

# Grid search parameters
model_param_list = candidate_param_list()

print('Using BAG OF WORDS: ')
X_train_bow, X_test_bow, Y_train_bow, Y_test_bow = split_data(X_BOW)
best_classifier_bw, f1_score_bw = model_comparison_grid_search(model_param_list, X_train_bow, Y_train_bow)

# Using TFidf
print('Using TF-IDF: ')
X_train_tf, X_test_tf, Y_train_tf, Y_test_tf = split_data(X_TF)
best_classifier_tf, f1_score_tf = model_comparison_grid_search(model_param_list, X_train_tf, Y_train_tf)

# Keep the feature representation whose best grid-search F1 is higher.
# x_test / y_test are the held-out 20% split of train_data, not test_data:
# calling extract_using_* on test_data would fit a fresh vectorizer on it, so
# its columns would not line up with the features the classifier was trained on.
if f1_score_tf > f1_score_bw:
    print('Using TF-IDF for the final predictions.')
    x_test = X_test_tf
    y_test = Y_test_tf
    best_classifier = best_classifier_tf
else:
    print('Using BAG OF WORDS for the final predictions.')
    x_test = X_test_bow
    y_test = Y_test_bow
    best_classifier = best_classifier_bw

SVM
SVC(C=1, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=True, random_state=0, shrinking=True, tol=0.001,
  verbose=False)
Grid Search for  SVC(C=1, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=True, random_state=0, shrinking=True, tol=0.001,
  verbose=False)
[["{'C': 10, 'gamma': 0.5, 'kernel': 'rbf'}", SVC(C=10, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=0.5, kernel='rbf',
  max_iter=-1, probability=True, random_state=0, shrinking=True, tol=0.001,
  verbose=False), 0.615446286744192]]
RF
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min

  'setting alpha = %.1e' % _ALPHA_MIN)


[["{'C': 10, 'gamma': 0.5, 'kernel': 'rbf'}", SVC(C=10, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=0.5, kernel='rbf',
  max_iter=-1, probability=True, random_state=0, shrinking=True, tol=0.001,
  verbose=False), 0.615446286744192], ["{'criterion': 'entropy', 'n_estimators': 400}", RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=400, n_jobs=None,
            oob_score=False, random_state=0, verbose=0, warm_start=False), 0.5862005319713827], ["{'criterion': 'entropy'}", DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
       

In [17]:
# Show the best estimator found on the TF-IDF features together with its grid-search F1 score
print(best_classifier_tf, f1_score_tf)

SVC(C=10, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=0.5, kernel='rbf',
  max_iter=-1, probability=True, random_state=0, shrinking=True, tol=0.001,
  verbose=False) 0.615446286744192


In [21]:
# Disabled: writing predictions for test_data. As written it would assign
# predictions made on x_test (the held-out split of train_data) to test_data.
# # Predict output
# test_data['label'] = calc_predictions(best_classifier, x_test)
# test_data[['id', 'label']].to_csv('test_predictions.csv', index=False)

# Evaluate the selected classifier on the held-out split (x_test / y_test).
# NOTE(review): the 'test dataset' label below refers to that held-out split, not to test_data.
y_predict = calc_predictions(best_classifier, x_test)
# Last expression of the cell, so the returned F1 score is also displayed as the cell output.
calculate_f1score(y_test, y_predict, 'test dataset')

Accuracy:  0.9513530423901142
F1 score of classifier  test dataset  is:  0.6221142162818954




0.6221142162818954