In [2]:
import nltk

nltk.download('punkt')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\dell\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

In [3]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\dell\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [4]:
from nltk.tokenize import word_tokenize, sent_tokenize # based on spaces and ponctualisation

text = "Hello world. This is an example sentence."
words = word_tokenize(text)
sentences = sent_tokenize(text)

print("Words:", words)
print("Sentences:", sentences)

Words: ['Hello', 'world', '.', 'This', 'is', 'an', 'example', 'sentence', '.']
Sentences: ['Hello world.', 'This is an example sentence.']


In [1]:
import pandas as pd

train_data = pd.read_csv("./data/train.csv")
test_data = pd.read_csv('./data/test.csv') 

In [2]:
train_data

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1
...,...,...,...,...,...
7608,10869,,,Two giant cranes holding a bridge collapse int...,1
7609,10870,,,@aria_ahrary @TheTawniest The out of control w...,1
7610,10871,,,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1
7611,10872,,,Police investigating after an e-bike collided ...,1


In [9]:
def clean_text(df , column):
  df[column] = df[column].str.replace('#' , "")
  df[column] = df[column].str.replace('@' , 'at')
  df[column] = df[column].str.replace(r"http\S+", "")
  df[column] = df[column].str.replace(r"http", "")
  df[column] = df[column].str.replace(r"@\S+", "")
  df[column] = df[column].str.replace(r"[^A-Za-z0-9(),!?@\'\`\"\_\n]", " ")
  df[column] = df[column].str.lower()
  return df

clean_train = clean_text(train_data , 'text')
clean_train
# our text column is clean
clean_test = clean_text(test_data , 'text')
clean_test.head()
  

Unnamed: 0,id,keyword,location,text
0,0,,,just happened a terrible car crash
1,2,,,"heard about earthquake is different cities, st..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,apocalypse lighting. spokane wildfires
4,11,,,typhoon soudelor kills 28 in china and taiwan


In [11]:
# add a column for list of tokens
from nltk.tokenize import TweetTokenizer

tokenizer = TweetTokenizer()
clean_train['tokens'] = [ tokenizer.tokenize(tweet) for tweet in clean_train['text']]
clean_test['tokens'] = [tokenizer.tokenize(tweet) for tweet in clean_test['text']]
clean_train

Unnamed: 0,id,keyword,location,text,target,tokens
0,1,,,our deeds are the reason of this earthquake ma...,1,"[our, deeds, are, the, reason, of, this, earth..."
1,4,,,forest fire near la ronge sask. canada,1,"[forest, fire, near, la, ronge, sask, ., canada]"
2,5,,,all residents asked to 'shelter in place' are ...,1,"[all, residents, asked, to, ', shelter, in, pl..."
3,6,,,"13,000 people receive wildfires evacuation ord...",1,"[13,000, people, receive, wildfires, evacuatio..."
4,7,,,just got sent this photo from ruby alaska as s...,1,"[just, got, sent, this, photo, from, ruby, ala..."
...,...,...,...,...,...,...
7608,10869,,,two giant cranes holding a bridge collapse int...,1,"[two, giant, cranes, holding, a, bridge, colla..."
7609,10870,,,ataria_ahrary atthetawniest the out of control...,1,"[ataria_ahrary, atthetawniest, the, out, of, c..."
7610,10871,,,m1.94 [01:04 utc]?5km s of volcano hawaii. ://...,1,"[m1, ., 94, [, 01:04, utc, ], ?, 5km, s, of, v..."
7611,10872,,,police investigating after an e-bike collided ...,1,"[police, investigating, after, an, e-bike, col..."


In [12]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

x_train , x_test , y_train , y_test = train_test_split(clean_train['text'] , clean_train['target'] , test_size=0.2 , random_state=42)

count_vec = CountVectorizer()
x_train_numerised = count_vec.fit_transform(x_train)
x_test_numerised = count_vec.transform(x_test) # no need to fit because there is in training
x_train_numerised

<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 86049 stored elements and shape (6090, 18650)>

In [25]:
from sklearn.metrics import confusion_matrix , accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier




models = {
    'Logistic Regression': LogisticRegression(),
    'Decision Tree': DecisionTreeClassifier(),
    'Random Forest': RandomForestClassifier(),
    'Gradient Boosting': GradientBoostingClassifier(),
    'Support Vector classifier': SVC(),
    'xgboost ' : XGBClassifier(),
    'kneighobors ': KNeighborsClassifier()
}

In [26]:
for model_name, model in models.items():
  print(f'training {model_name} model... ')
  model.fit(x_train_numerised , y_train)
  y_pred = model.predict(x_test_numerised)
  accuracy = accuracy_score(y_pred , y_test)
  print(f'accuracy of {model_name} : ', accuracy)


training Logistic Regression model... 
accuracy of Logistic Regression :  0.8089297439264609
training Decision Tree model... 
accuracy of Decision Tree :  0.7321076822061721
training Random Forest model... 
accuracy of Random Forest :  0.7892317793827971
training Gradient Boosting model... 
accuracy of Gradient Boosting :  0.7636244254760342
training Support Vector Regression model... 
accuracy of Support Vector Regression :  0.8115561391989494
training xgboost  model... 
accuracy of xgboost  :  0.793827971109652
training kneighobors  model... 
accuracy of kneighobors  :  0.685489166119501


In [28]:
# svc is the best with accuracy 0.81 , let's fine tune the hyperparameters
from sklearn.model_selection import GridSearchCV
param_grid = {
    'C': [0.1, 1, 10, 100],
    'gamma': [1, 0.1, 0.01, 0.001],
    'kernel': ['rbf', 'poly', 'sigmoid']
}
best_model = SVC()
gridsearch = GridSearchCV(best_model , param_grid , verbose=2 , cv=5)
gridsearch.fit(x_train_numerised , y_train)


Fitting 5 folds for each of 48 candidates, totalling 240 fits
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=   3.8s
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=   3.8s
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=   3.8s
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=   3.9s
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=   3.9s
[CV] END ........................C=0.1, gamma=1, kernel=poly; total time=   3.4s
[CV] END ........................C=0.1, gamma=1, kernel=poly; total time=   4.5s
[CV] END ........................C=0.1, gamma=1, kernel=poly; total time=   3.6s
[CV] END ........................C=0.1, gamma=1, kernel=poly; total time=   3.6s
[CV] END ........................C=0.1, gamma=1, kernel=poly; total time=   3.4s
[CV] END .....................C=0.1, gamma=1, kernel=sigmoid; total time=   2.5s
[CV] END .....................C=0.1, gamma=1, k

NameError: name 'grid_search' is not defined

In [29]:
print(f"Best Parameters: {gridsearch.best_params_}")
print(f"Best Score: {gridsearch.best_score_}")

Best Parameters: {'C': 100, 'gamma': 0.001, 'kernel': 'sigmoid'}
Best Score: 0.799671592775041


In [32]:
best_model = gridsearch.best_estimator_
predictions = best_model.predict(x_test_numerised)
print(accuracy_score(predictions , y_test))

0.8174655285620486
