Import module

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import string
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
from nltk.corpus import stopwords
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import StratifiedKFold, KFold, GridSearchCV
from sklearn.svm import SVC
from nltk.util import ngrams
from sklearn.feature_extraction.text import CountVectorizer
from collections import defaultdict
from collections import  Counter
from nltk.tokenize import word_tokenize
import re
import warnings 

stop=set(stopwords.words('english'))
warnings.filterwarnings("ignore")
%matplotlib inline

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Load Data

In [2]:
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

print('Train Data size :{}'.format(train_df.shape))
print('Test Data size :{}'.format(test_df.shape))
Merge_df = train_df.append(test_df,ignore_index=True)

Train Data size :(7613, 5)
Test Data size :(3263, 4)


Remove twitter handle, URL, http tags, punctuation, special characters, numbers

In [3]:
def remove_pattern(input_txt, pattern):
    reg_obj = re.compile(pattern)
    output_txt = reg_obj.sub(r'', input_txt)

    return output_txt   


Merge_df['text'] = Merge_df['text'].apply(lambda x: remove_pattern(x,"@[\w]*"))
Merge_df['text'] = Merge_df['text'].apply(lambda x: remove_pattern(x,'https?://\S+|www\.\S+'))
Merge_df['text'] = Merge_df['text'].apply(lambda x: remove_pattern(x,'<.*?>'))
Merge_df['text'] = Merge_df['text'].apply(lambda x: remove_pattern(x,"[^a-zA-Z# ]"))

Remove Stop words

In [4]:
def remove_stop_words(text):
    word_tokens = word_tokenize(text) 
    filtered_sentence = [w for w in word_tokens if not w in stop] 
    filtered_tweet = ' '.join(filtered_sentence)
    
    return filtered_tweet


Merge_df['text'] = Merge_df['text'].apply(lambda x: remove_stop_words(x))

Tokenization & Lemmatizaion

In [5]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()


def tokenize_stem(text):
    token_words = word_tokenize(text)
    stem_words =[]
    for i in token_words:
        word = lemmatizer.lemmatize(i)
        stem_words.append(word)
        
    final_tweet = ' '.join(stem_words)
    
    return final_tweet


Merge_df['text'] = Merge_df['text'].apply(lambda x: tokenize_stem(x))

Output cleaned data

In [6]:
training_df = Merge_df[:7613]
testing_df = Merge_df[7613:]

training_df.to_csv("train_clean.csv")
testing_df.to_csv("test_clean.csv")

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer


tfidf_vectorizer = TfidfVectorizer(max_df=0.90, min_df=2, max_features=300, stop_words='english')
tfidf = tfidf_vectorizer.fit_transform(Merge_df['text'])

train_data  = tfidf[:7613]
train_label = Merge_df[:7613]['target']
test_data = tfidf[7613:]

labels = ['good', 'bad']

In [8]:
from sklearn.naive_bayes import MultinomialNB

model = MultinomialNB()

In [9]:
from sklearn.model_selection import KFold
from sklearn.metrics import confusion_matrix, classification_report

kf = KFold(n_splits=3)

test = []
pred = []

for train_index, test_index in kf.split(train_data):
  x_train, x_test = train_data[train_index], train_data[test_index]
  y_train, y_test = train_label[train_index], train_label[test_index]
  test.extend(y_test)
  
  model.fit(x_train, y_train)
  y_pred = model.predict(x_test)
  pred.extend(y_pred)

print('Confusion matrix')
print(confusion_matrix(test, pred)/3)
print(classification_report(test, pred, target_names=labels))

Confusion matrix
[[1137.66666667  309.66666667]
 [ 460.66666667  629.66666667]]
              precision    recall  f1-score   support

        good       0.71      0.79      0.75      4342
         bad       0.67      0.58      0.62      3271

    accuracy                           0.70      7613
   macro avg       0.69      0.68      0.68      7613
weighted avg       0.69      0.70      0.69      7613



In [10]:
submission = pd.read_csv("sample_submission.csv")

submission['target'] = model.predict(test_data).astype(int)
submission.to_csv('MultinomialNB_submission.csv', index=False)

In [11]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(n_estimators=500)

In [12]:
kf = KFold(n_splits=3)

test = []
pred = []

for train_index, test_index in kf.split(train_data):
  x_train, x_test = train_data[train_index], train_data[test_index]
  y_train, y_test = train_label[train_index], train_label[test_index]
  test.extend(y_test)
  
  model.fit(x_train, y_train)
  y_pred = model.predict(x_test)
  pred.extend(y_pred)

print('Confusion matrix')
print(confusion_matrix(test, pred)/3)
print(classification_report(test, pred, target_names=labels))

Confusion matrix
[[1112.33333333  335.        ]
 [ 468.          622.33333333]]
              precision    recall  f1-score   support

        good       0.70      0.77      0.73      4342
         bad       0.65      0.57      0.61      3271

    accuracy                           0.68      7613
   macro avg       0.68      0.67      0.67      7613
weighted avg       0.68      0.68      0.68      7613



In [13]:
submission = pd.read_csv("sample_submission.csv")

submission['target'] = model.predict(test_data).astype(int)
submission.to_csv('RandomForestClassifier_submission.csv', index=False)

In [14]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

params = [
    {'solver':['liblinear','lbfgs'], 'max_iter':[200,400]},
]

model = GridSearchCV(LogisticRegression(), params, scoring='accuracy', cv=5)

In [15]:
kf = KFold(n_splits=3)

test = []
pred = []

for train_index, test_index in kf.split(train_data):
  x_train, x_test = train_data[train_index], train_data[test_index]
  y_train, y_test = train_label[train_index], train_label[test_index]
  test.extend(y_test)
  
  model.fit(x_train, y_train)
  y_pred = model.predict(x_test)
  pred.extend(y_pred)

print('Confusion matrix')
print(confusion_matrix(test, pred)/3)
print(classification_report(test, pred, target_names=labels))

Confusion matrix
[[1238.66666667  208.66666667]
 [ 557.33333333  533.        ]]
              precision    recall  f1-score   support

        good       0.69      0.86      0.76      4342
         bad       0.72      0.49      0.58      3271

    accuracy                           0.70      7613
   macro avg       0.70      0.67      0.67      7613
weighted avg       0.70      0.70      0.69      7613



In [16]:
submission = pd.read_csv("sample_submission.csv")

submission['target'] = model.predict(test_data).astype(int)
submission.to_csv('LogisticRegression_submission.csv', index=False)