In [54]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.decomposition import PCA
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix 
from scipy.sparse import csr_matrix

In [2]:
data = pd.read_csv('train.csv')

In [3]:
data.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [4]:
data.shape

(7613, 5)

In [5]:
data['keyword'].value_counts()

fatalities               45
deluge                   42
armageddon               42
sinking                  41
damage                   41
                         ..
forest%20fire            19
epicentre                12
threat                   11
inundation               10
radiation%20emergency     9
Name: keyword, Length: 221, dtype: int64

In [6]:
data.isnull().sum()

id             0
keyword       61
location    2533
text           0
target         0
dtype: int64

In [7]:
#data['location'].unique().tolist()

In [8]:
#data['keyword'].unique().tolist()

In [9]:
len(data['keyword'].unique())

222

In [10]:
data = data.fillna('')

In [11]:
#dropping records that have both keyword and location as null
#mask = (data['keyword'].isnull()) & (data['location'].isnull())
#data = data.drop(data[mask].index, inplace=False)

In [12]:
data.shape

(7613, 5)

In [13]:
data.isnull().sum()

id          0
keyword     0
location    0
text        0
target      0
dtype: int64

###### Replacing null locations with mode 

In [14]:
#mode_of_location = data.pivot_table(values='location', columns = 'keyword', aggfunc=(lambda x: x.mode()[0]))

In [15]:
#mode_of_location

In [16]:
#missing_values = data['location'].isnull()

In [17]:
#missing_values

In [18]:
#data.loc[missing_values, 'location'] = data.loc[missing_values, 'keyword'].apply(lambda x: mode_of_location)

In [19]:
#data.isnull().sum()

In [20]:
data.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [21]:
#dropping ID column
data = data.drop('id', axis = 1)

In [22]:
data.head()

Unnamed: 0,keyword,location,text,target
0,,,Our Deeds are the Reason of this #earthquake M...,1
1,,,Forest fire near La Ronge Sask. Canada,1
2,,,All residents asked to 'shelter in place' are ...,1
3,,,"13,000 people receive #wildfires evacuation or...",1
4,,,Just got sent this photo from Ruby #Alaska as ...,1


In [23]:
data.dtypes

keyword     object
location    object
text        object
target       int64
dtype: object

###### Creating new column to combine keyword and location

In [24]:
data['content'] = data['keyword']+'' + data['location'] + '' + data['text']

In [25]:
data.head()

Unnamed: 0,keyword,location,text,target,content
0,,,Our Deeds are the Reason of this #earthquake M...,1,Our Deeds are the Reason of this #earthquake M...
1,,,Forest fire near La Ronge Sask. Canada,1,Forest fire near La Ronge Sask. Canada
2,,,All residents asked to 'shelter in place' are ...,1,All residents asked to 'shelter in place' are ...
3,,,"13,000 people receive #wildfires evacuation or...",1,"13,000 people receive #wildfires evacuation or..."
4,,,Just got sent this photo from Ruby #Alaska as ...,1,Just got sent this photo from Ruby #Alaska as ...


In [26]:
data['content'] = data['content'].astype(str)


In [27]:
X = data.drop(columns = 'target', axis = 1)
Y = data['target']

###### Label Encoder

In [28]:
#data['location'] = data['location'].astype(str)
#data['keyword'] = data['keyword'].astype(str)

In [29]:
#encoder = LabelEncoder()

In [30]:
#data['location'] = encoder.fit_transform(data['location'])
#data['keyword'] = encoder.fit_transform(data['keyword'])

In [31]:
#data['location'].value_counts()

###### Stemming and Vectorizer

In [32]:
#port_stem = PorterStemmer()

In [33]:
'''def stemming(content):
    stemmed_content = re.sub('[^a-zA-Z]', ' ', content) #this regex is looking for words from a-z only. no numbers. commas and fullstops are replaced with a space as indicated by ' '
    stemmed_content = stemmed_content.lower()#convert everything to lowercase letters
    stemmed_content = stemmed_content.split()#convert everything in content to a list
    stemmed_content = [port_stem.stem(word) for word in stemmed_content if not word in stopwords.words('english')] #reducing words to their root word - but for loop is removing all stop words
    stemmed_content = ' '.join(stemmed_content)#joining all words
    return stemmed_content'''

"def stemming(content):\n    stemmed_content = re.sub('[^a-zA-Z]', ' ', content) #this regex is looking for words from a-z only. no numbers. commas and fullstops are replaced with a space as indicated by ' '\n    stemmed_content = stemmed_content.lower()#convert everything to lowercase letters\n    stemmed_content = stemmed_content.split()#convert everything in content to a list\n    stemmed_content = [port_stem.stem(word) for word in stemmed_content if not word in stopwords.words('english')] #reducing words to their root word - but for loop is removing all stop words\n    stemmed_content = ' '.join(stemmed_content)#joining all words\n    return stemmed_content"

In [34]:
from nltk.stem import SnowballStemmer

def stemming(content):
    stemmed_content = re.sub('[^a-zA-Z]', ' ', content) 
    stemmed_content = stemmed_content.lower()
    stemmed_content = stemmed_content.split()
    snowball_stem = SnowballStemmer(language='english')
    stemmed_content = [snowball_stem.stem(word) for word in stemmed_content if word not in stopwords.words('english')]
    stemmed_content = ' '.join(stemmed_content)
    return stemmed_content


In [35]:
data['content'] = data['content'].apply(stemming)

In [36]:
print(data['content'])

0               deed reason earthquak may allah forgiv us
1                    forest fire near la rong sask canada
2       resid ask shelter place notifi offic evacu she...
3             peopl receiv wildfir evacu order california
4       got sent photo rubi alaska smoke wildfir pour ...
                              ...                        
7608    two giant crane hold bridg collaps nearbi home...
7609    aria ahrari thetawniest control wild fire cali...
7610             utc km volcano hawaii http co zdtoyd ebj
7611    polic investig e bike collid car littl portug ...
7612    latest home raze northern california wildfir a...
Name: content, Length: 7613, dtype: object


In [37]:
X = data['content'].values
Y = data['target'].values

In [38]:
print(X,Y)

['deed reason earthquak may allah forgiv us'
 'forest fire near la rong sask canada'
 'resid ask shelter place notifi offic evacu shelter place order expect'
 ... 'utc km volcano hawaii http co zdtoyd ebj'
 'polic investig e bike collid car littl portug e bike rider suffer serious non life threaten injuri'
 'latest home raze northern california wildfir abc news http co ymi rskq'] [1 1 1 ... 1 1 1]


In [39]:
vectorizer = TfidfVectorizer() 
vectorizer.fit(X) 
X = vectorizer.transform(X) #convert all values to respective features

In [40]:
print(X.shape, Y.shape)

(7613, 26490) (7613,)


In [41]:
print(X)

  (0, 23830)	0.27583297987492594
  (0, 18710)	0.35176551667005757
  (0, 14357)	0.2944679025040954
  (0, 9096)	0.4601979122885903
  (0, 7023)	0.3264721804901336
  (0, 5620)	0.47567957863899146
  (0, 534)	0.41088754939408534
  (1, 19804)	0.4856420874212593
  (1, 19387)	0.5079192668223127
  (1, 15571)	0.30456128004141125
  (1, 13204)	0.36099543042535365
  (1, 9088)	0.314986937624406
  (1, 8588)	0.22839510059028587
  (1, 3406)	0.36292342770853014
  (2, 20301)	0.5906727775065879
  (2, 18991)	0.2849063437754631
  (2, 17499)	0.46075066109361407
  (2, 16676)	0.22393341828405666
  (2, 16372)	0.22070858653868422
  (2, 15960)	0.3351670792316749
  (2, 7899)	0.23464560831925124
  (2, 7728)	0.18612546903991178
  (2, 1196)	0.238214598136075
  (3, 24971)	0.38031879021445264
  (3, 18726)	0.5701174302881474
  :	:
  (7611, 21570)	0.2522971328474761
  (7611, 20161)	0.2436098288009067
  (7611, 19159)	0.27164163500085636
  (7611, 17727)	0.2956019796087512
  (7611, 17650)	0.17920308086689077
  (7611, 15897)	

###### Train Test split

In [42]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = .2, stratify = Y)

In [43]:
print(X.shape, X_train.shape, X_test.shape)

(7613, 26490) (6090, 26490) (1523, 26490)


###### NN Model

In [44]:
import tensorflow as tf
from keras.models import Sequential
from keras.layers import Flatten, Dense
from keras.layers import Dropout
from keras.layers import BatchNormalization
from keras.callbacks import EarlyStopping

# Define the model
model = Sequential([
    Flatten(input_shape=(26490,)),        
    Dense(300, activation='relu'),       
    BatchNormalization(),                
    Dropout(0.5),                       
    Dense(2, activation='sigmoid')      
])

  super().__init__(**kwargs)


In [45]:
#look into optimizers, loss (one hot encoding label encoding)
model.compile(optimizer = 'adam',
              loss = 'sparse_categorical_crossentropy',
              metrics = ['accuracy']
             )

In [46]:
# Define early stopping criteria
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

In [47]:
history = model.fit(X_train, Y_train, validation_split= .1, epochs = 10, callbacks=[early_stopping])

Epoch 1/10
[1m172/172[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 48ms/step - accuracy: 0.6993 - loss: 0.5747 - val_accuracy: 0.6059 - val_loss: 0.6307
Epoch 2/10
[1m172/172[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 48ms/step - accuracy: 0.9209 - loss: 0.2123 - val_accuracy: 0.7488 - val_loss: 0.5396
Epoch 3/10
[1m172/172[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 48ms/step - accuracy: 0.9630 - loss: 0.1118 - val_accuracy: 0.7685 - val_loss: 0.4814
Epoch 4/10
[1m172/172[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 48ms/step - accuracy: 0.9769 - loss: 0.0745 - val_accuracy: 0.7750 - val_loss: 0.6067
Epoch 5/10
[1m172/172[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 48ms/step - accuracy: 0.9850 - loss: 0.0516 - val_accuracy: 0.7849 - val_loss: 0.7536
Epoch 6/10
[1m172/172[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 48ms/step - accuracy: 0.9887 - loss: 0.0393 - val_accuracy: 0.7915 - val_loss: 0.7815


###### RandomForest

In [48]:
from sklearn.ensemble import RandomForestClassifier

In [49]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
# Define the parameter grid for RandomizedSearchCV
param_grid_random = {
    'n_estimators': [100, 200, 300, 400, 500],
    'max_depth': [None, 10, 20, 30, 40, 50],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['auto', 'sqrt', 'log2']
}

# Create a RandomForestClassifier
rf = RandomForestClassifier()

# Create RandomizedSearchCV object
random_search = RandomizedSearchCV(estimator=rf, param_distributions=param_grid_random, n_iter=30, cv=5, verbose=2, random_state=42, n_jobs=-1)

# Perform random search
random_search.fit(X_train, Y_train)

# Get best hyperparameters from random search
best_params_random = random_search.best_params_

# Define the parameter grid for GridSearchCV using the best parameters from random search
param_grid_grid = {
    'n_estimators': [best_params_random['n_estimators']],
    'max_depth': [best_params_random['max_depth']],
    'min_samples_split': [best_params_random['min_samples_split'] - 1, 
                          best_params_random['min_samples_split'], 
                          best_params_random['min_samples_split'] + 1],
    'min_samples_leaf': [best_params_random['min_samples_leaf'] - 1, 
                         best_params_random['min_samples_leaf'], 
                         best_params_random['min_samples_leaf'] + 1],
    'max_features': [best_params_random['max_features']]
}

# Create GridSearchCV object
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid_grid, cv=5, verbose=2, n_jobs=-1)

# Perform grid search
grid_search.fit(X_train, Y_train)

# Best parameters and best score
print("Best parameters found: ", grid_search.best_params_)
print("Best score found: ", grid_search.best_score_)

Fitting 5 folds for each of 30 candidates, totalling 150 fits
Fitting 5 folds for each of 9 candidates, totalling 45 fits
Best parameters found:  {'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 200}
Best score found:  0.7893267651888342


In [50]:
#Accuracy score
X_train_prediction = grid_search.predict(X_train)
train_data_accuracy = accuracy_score(X_train_prediction, Y_train)
print('Accuracy:', train_data_accuracy)

Accuracy: 0.8958949096880131


In [51]:
#Accuracy score
X_test_prediction = grid_search.predict(X_test)
test_data_accuracy = accuracy_score(X_test_prediction, Y_test)
print('Accuracy:', test_data_accuracy)

Accuracy: 0.7925147734734077


###### Model Selection

In [55]:
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn import svm
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

is_sparse = isinstance(X_train, csr_matrix)

# Feature scaling
scaler = StandardScaler(with_mean=not is_sparse)
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Define parameter grids for randomized search (coarse search)
logistic_param_grid_coarse = {
    'penalty': ['l1', 'l2'],
    'C': [0.01, 0.1, 1, 10, 100],
    'solver': ['liblinear', 'saga']
}

xgboost_param_grid_coarse = {
    'learning_rate': [0.01, 0.1, 0.2],
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 5, 7],
    'min_child_weight': [1, 3, 5],
    'subsample': [0.5, 0.8, 1.0],
    'colsample_bytree': [0.5, 0.8, 1.0]
}

svm_param_grid_coarse = {
    'C': [0.1, 1, 10],
    'gamma': [0.01, 0.1, 1],
    'kernel': ['linear', 'rbf']
}

# RandomizedSearchCV for each model (coarse search)
logistic_random_search_coarse = RandomizedSearchCV(LogisticRegression(), logistic_param_grid_coarse, n_iter=30, cv=5, n_jobs=-1)
xgboost_random_search_coarse = RandomizedSearchCV(XGBClassifier(objective='binary:logistic'), xgboost_param_grid_coarse, n_iter=30, cv=5, n_jobs=-1)
svm_random_search_coarse = RandomizedSearchCV(svm.SVC(), svm_param_grid_coarse, n_iter=30, cv=5, n_jobs=-1)

# Fit models using RandomizedSearchCV (coarse search)
logistic_random_search_coarse.fit(X_train_scaled, Y_train)
xgboost_random_search_coarse.fit(X_train_scaled, Y_train)
svm_random_search_coarse.fit(X_train_scaled, Y_train)

# Get best hyperparameters from RandomizedSearchCV (coarse search)
best_logistic_params_coarse = logistic_random_search_coarse.best_params_
best_xgboost_params_coarse = xgboost_random_search_coarse.best_params_
best_svm_params_coarse = svm_random_search_coarse.best_params_

# Define parameter grids for GridSearchCV (fine search)
logistic_param_grid_fine = {
    'penalty': [best_logistic_params_coarse['penalty']],
    'C': [best_logistic_params_coarse['C'] * i for i in [0.1, 1, 10]],
    'solver': [best_logistic_params_coarse['solver']]
}

xgboost_param_grid_fine = {
    'learning_rate': [best_xgboost_params_coarse['learning_rate'] * i for i in [0.5, 1, 2]],
    'n_estimators': [best_xgboost_params_coarse['n_estimators']],
    'max_depth': [best_xgboost_params_coarse['max_depth']],
    'min_child_weight': [best_xgboost_params_coarse['min_child_weight']],
    'subsample': [best_xgboost_params_coarse['subsample']],
    'colsample_bytree': [best_xgboost_params_coarse['colsample_bytree']]
}

svm_param_grid_fine = {
    'C': [best_svm_params_coarse['C'] * i for i in [0.1, 1, 10]],
    'gamma': [best_svm_params_coarse['gamma'] * i for i in [0.1, 1, 10]],
    'kernel': [best_svm_params_coarse['kernel']]
}

# GridSearchCV for each model (fine search)
logistic_grid_search_fine = GridSearchCV(LogisticRegression(), param_grid=logistic_param_grid_fine, cv=5, n_jobs=-1)
xgboost_grid_search_fine = GridSearchCV(XGBClassifier(objective='binary:logistic'), param_grid=xgboost_param_grid_fine, cv=5, n_jobs=-1)
svm_grid_search_fine = GridSearchCV(svm.SVC(), param_grid=svm_param_grid_fine, cv=5, n_jobs=-1)

# Fit models using GridSearchCV (fine search)
logistic_grid_search_fine.fit(X_train_scaled, Y_train)
xgboost_grid_search_fine.fit(X_train_scaled, Y_train)
svm_grid_search_fine.fit(X_train_scaled, Y_train)

# Print best hyperparameters from GridSearchCV (fine search)
print("Logistic Regression Best Parameters (Fine Search):", logistic_grid_search_fine.best_params_)
print("XGBoost Best Parameters (Fine Search):", xgboost_grid_search_fine.best_params_)
print("SVM Best Parameters (Fine Search):", svm_grid_search_fine.best_params_)

# Compare cross-validated scores of each model
logistic_cv_score_fine = logistic_grid_search_fine.best_score_
xgboost_cv_score_fine = xgboost_grid_search_fine.best_score_
svm_cv_score_fine = svm_grid_search_fine.best_score_

# Select the best model based on cross-validated scores
best_model_fine = None
if logistic_cv_score_fine >= xgboost_cv_score_fine and logistic_cv_score_fine >= svm_cv_score_fine:
    best_model_fine = logistic_grid_search_fine.best_estimator_
elif xgboost_cv_score_fine >= logistic_cv_score_fine and xgboost_cv_score_fine >= svm_cv_score_fine:
    best_model_fine = xgboost_grid_search_fine.best_estimator_
else:
    best_model_fine = svm_grid_search_fine.best_estimator_

# Evaluate the best model on the test set
train_accuracy_fine = best_model_fine.score(X_train_scaled, Y_train)
print("Best Model Train Accuracy (Fine Search):", train_accuracy_fine)
test_accuracy_fine = best_model_fine.score(X_test_scaled, Y_test)
print("Best Model Test Accuracy (Fine Search):", test_accuracy_fine)



Logistic Regression Best Parameters (Fine Search): {'C': 0.1, 'penalty': 'l1', 'solver': 'saga'}
XGBoost Best Parameters (Fine Search): {'colsample_bytree': 0.5, 'learning_rate': 0.2, 'max_depth': 7, 'min_child_weight': 1, 'n_estimators': 200, 'subsample': 1.0}
SVM Best Parameters (Fine Search): {'C': 0.010000000000000002, 'gamma': 0.001, 'kernel': 'linear'}
Best Model Train Accuracy (Fine Search): 0.8973727422003284
Best Model Test Accuracy (Fine Search): 0.7872619829284307


In [57]:
X_new = X_test[70]
print(Y_test[70])
prediction = best_model_fine.predict(X_new)
print(prediction)

if (prediction[0]==0):
    print('This is fake tweet')
else:
    print('This is real tweet')

1
[1]
This is real tweet
