In [None]:
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


# **PRE-PROCESSING**

In [None]:
############################################################# PREPROCESSING

import numpy as np 
import pandas as pd 
import nltk
import string

from nltk.corpus import stopwords
nltk.download('stopwords')
stop = stopwords.words('english')

def remove_stopwords(text) :
  arr_of_words=text.split()
  return_text=''
  for word in arr_of_words :
    if word not in stop :
      return_text+=word
      return_text+=' '
  return return_text

def remove_punctuations(text):
    for punctuation in string.punctuation:
        text = text.replace(punctuation, '')
    return text

def make_lower(text) :
  lower_text=text.lower()
  return lower_text

def remove_numbers(text) :
  alpha_text=''
  arr_of_words=text.split()
  for i in arr_of_words :
    try :
      x=int(i)
    except ValueError :
      alpha_text+=i
      alpha_text+=' '
  return alpha_text

def deal_media(text) :
  arr_of_words=text.split()
  final_text=''
  for i in arr_of_words :
    if 'http://' in i :
      word='aszxdcfvgb'
    else :
      word=i
    final_text+=word
    final_text+=' '
  return final_text

def remove_nan(text) :
  text = text.replace(' nan ', '')
  return text



train_df = pd.read_csv("/content/drive/My Drive/Colab Notebooks/ML_Project/ML_data/train.csv") 
test_df = pd.read_csv("/content/drive/My Drive/Colab Notebooks/ML_Project/ML_data/test.csv")

#train_df = pd.read_csv("/content/drive/My Drive/ML_data/train.csv") 
#test_df = pd.read_csv("/content/drive/My Drive/ML_data/test.csv")

#adding location to text
train_df["text"] = train_df["text"] + ' ' + train_df["location"].astype(str) +' '
test_df["text"] = test_df["text"] + ' ' + test_df["location"].astype(str) +' '


train_df["text"] = train_df["text"].apply(remove_nan)
train_df["text"] = train_df["text"].apply(deal_media)
train_df["text"] = train_df["text"].apply(remove_stopwords)
train_df["text"] = train_df["text"].apply(remove_punctuations)
train_df["text"] = train_df["text"].apply(make_lower)
train_df["text"] = train_df["text"].apply(remove_numbers)

test_df["text"] = test_df["text"].apply(remove_nan)
test_df["text"] = test_df["text"].apply(deal_media)
test_df["text"] = test_df["text"].apply(remove_stopwords)
test_df["text"] = test_df["text"].apply(remove_punctuations)
test_df["text"] = test_df["text"].apply(make_lower)
test_df["text"] = test_df["text"].apply(remove_numbers)


x_train=train_df["text"]
y_train=train_df["target"]
x_test=test_df["text"]


print("Preprocessing done")
print("")
print("Locations added to text")
print("Links were removed")
print("Stopwords removed")
print("punctuations removed")
print("Numbers removed")
print("Lowered the text")


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
Preprocessing done

Locations added to text
Links were removed
Stopwords removed
punctuations removed
Numbers removed
Lowered the text


# **VECTORIZATION**

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

cv = CountVectorizer()
tv = TfidfVectorizer()

x_traincv = cv.fit_transform(x_train)
x_traintv = tv.fit_transform(x_train)

x_testcv = cv.transform(x_test)
x_testtv = tv.transform(x_test)

print("x train and x test are formed ")

x train and x test are formed 


# **RF Classifier GridSearch**

In [4]:

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

rfc=RandomForestClassifier()


parameters_for_n_estimator=[40, 50, 60]
parameters_for_max_features=['auto', 'sqrt']
parameters_for_n_jobs=[-1]


params={'n_estimators' : parameters_for_n_estimator, 'max_features' :parameters_for_max_features, 'n_jobs' :parameters_for_n_jobs}
gs=GridSearchCV(estimator=rfc, param_grid=params, cv=5)

gs = gs.fit(x_traincv, y_train)
best_params=gs.best_params_
accuracy=gs.best_score_
print("For RF Classifier (Count Vectorizer):")
print("Best hyperparameter :", best_params)
print('Accuracy of train data :', accuracy)
print("")

gs = gs.fit(x_traintv, y_train)
best_params=gs.best_params_
accuracy=gs.best_score_
print("For RF Classifier (Tfid Vectorizer):")
print("Best hyperparameter :", best_params)
print('Accuracy of train data :', accuracy)


For RF Classifier (Count Vectorizer):
Best hyperparameter : {'max_features': 'sqrt', 'n_estimators': 40, 'n_jobs': -1}
Accuracy of train data : 0.7371563404063665

For RF Classifier (Tfid Vectorizer):
Best hyperparameter : {'max_features': 'sqrt', 'n_estimators': 50, 'n_jobs': -1}
Accuracy of train data : 0.7366302848223861


# **Accuracy metrics**

In [2]:

from sklearn.model_selection import cross_val_predict
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix

rfc = RandomForestClassifier(max_features='sqrt', n_estimators=40, n_jobs=-1)
y_pred = cross_val_predict(rfc, x_traincv, y_train, cv=5)
accuracy = accuracy_score(y_train, y_pred)
print("For RF Classifier (Count Vectorizer):")
print("Accuracy :", accuracy)
f1 = f1_score(y_train, y_pred)
print("F1 Score :", f1)
conf_mat = confusion_matrix(y_train, y_pred)
print("Confusion Matrix :")
print(conf_mat)
print("")


rfc = RandomForestClassifier(max_features='sqrt', n_estimators=50, n_jobs=-1)
y_pred = cross_val_predict(rfc, x_traintv, y_train, cv=5)
accuracy = accuracy_score(y_train, y_pred)
print("For RF Classifier (Count Vectorizer):")
print("Accuracy :", accuracy)
f1 = f1_score(y_train, y_pred)
print("F1 Score :", f1)
conf_mat = confusion_matrix(y_train, y_pred)
print("Confusion Matrix :")
print(conf_mat)



For RF Classifier (TFID Vectorizer):
Accuracy : 0.7363601733876264
F1 Score : 0.6500308641975309
Confusion Matrix :
[[3751  591]
 [1949 1322]]

For RF Classifier (Count Vectorizer):
Accuracy : 0.7387245501116511
F1 Score : 0.6476142131979695
Confusion Matrix :
[[3791  551]
 [1971 1300]]

