In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import re
import warnings
warnings.filterwarnings('ignore')
from sklearn.metrics import classification_report
import nltk
from nltk.corpus import stopwords
#nltk.download('stopwords')
from sklearn.model_selection import train_test_split
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import RidgeClassifier

In [2]:
pd.options.display.max_colwidth = 200
dftest1 = pd.read_csv("test1.csv")
dftrain1 = pd.read_csv("train1.csv")

In [3]:
dftest1["text"].shape

(3263,)

In [4]:
dftrain1["text"].shape

(7613,)

In [5]:
dftrain1

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are being notified by officers. No other evacuation or shelter in place orders are expected,1
3,6,,,"13,000 people receive #wildfires evacuation orders in California",1
4,7,,,Just got sent this photo from Ruby #Alaska as smoke from #wildfires pours into a school,1
...,...,...,...,...,...
7608,10869,,,Two giant cranes holding a bridge collapse into nearby homes http://t.co/STfMbbZFB5,1
7609,10870,,,@aria_ahrary @TheTawniest The out of control wild fires in California even in the Northern part of the state. Very troubling.,1
7610,10871,,,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. http://t.co/zDtoyd8EbJ,1
7611,10872,,,Police investigating after an e-bike collided with a car in Little Portugal. E-bike rider suffered serious non-life threatening injuries.,1


In [6]:
print(dftest1.isnull().sum())
print(dftrain1.isnull().sum())

id             0
keyword       26
location    1105
text           0
dtype: int64
id             0
keyword       61
location    2533
text           0
target         0
dtype: int64


In [7]:
dftrain1.isnull().sum()

id             0
keyword       61
location    2533
text           0
target         0
dtype: int64

In [8]:
print(dftest1.info())
print(dftrain1.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3263 entries, 0 to 3262
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        3263 non-null   int64 
 1   keyword   3237 non-null   object
 2   location  2158 non-null   object
 3   text      3263 non-null   object
dtypes: int64(1), object(3)
memory usage: 102.1+ KB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        7613 non-null   int64 
 1   keyword   7552 non-null   object
 2   location  5080 non-null   object
 3   text      7613 non-null   object
 4   target    7613 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 297.5+ KB
None


In [9]:
print(dftest1.keys())
print(dftrain1.keys())

Index(['id', 'keyword', 'location', 'text'], dtype='object')
Index(['id', 'keyword', 'location', 'text', 'target'], dtype='object')


### Run the models with the PorterStemmer and CountVectorizer
#### I know I can iterate over these, but I prefer it this way. 
#### I am running a MultinomialNB, GaussianNB, Decision Tree, Logistic Regression, Random Forest, KNN, and a Ridge Classifier.

In [10]:
# Clean the tweets
stemmer = PorterStemmer()
stop_words = stopwords.words("english")
def clean_tweet(X5):
    if type(X5) == np.float:
        return ""
    tweet = X5.lower()
    tweet = re.sub("'", "", tweet) 
    tweet = re.sub("@[A-Za-z0-9_]+","", tweet)
    tweet = re.sub("#[A-Za-z0-9_]+","", tweet)
    tweet = re.sub(r'http\S+', '', tweet)
    tweet = re.sub('[()!?]', ' ', tweet)
    tweet = re.sub('\[.*?\]',' ', tweet)
    tweet = re.sub("[^a-z0-9]"," ", tweet)
    tweet = tweet.split()
    tweet =[stemmer.stem(word) for word in tweet if (word not in stop_words)]
    tweet = " ".join(word for word in tweet)
    
    return tweet

In [11]:
X5 = dftrain1["text"]
trainclean = [clean_tweet(tw) for tw in X5]
trainclean

['deed reason may allah forgiv us',
 'forest fire near la rong sask canada',
 'resid ask shelter place notifi offic evacu shelter place order expect',
 '13 000 peopl receiv evacu order california',
 'got sent photo rubi smoke pour school',
 'updat california hwi 20 close direct due lake counti fire',
 'heavi rain caus flash flood street manit colorado spring area',
 'im top hill see fire wood',
 'there emerg evacu happen build across street',
 'im afraid tornado come area',
 'three peopl die heat wave far',
 'haha south tampa get flood hah wait second live south tampa gonna gonna fvck',
 '18 19 day ive lost count',
 'bago myanmar arriv bago',
 'damag school bu 80 multi car crash',
 'what man',
 'love fruit',
 'summer love',
 'car fast',
 'goooooooaaaaaal',
 'ridicul',
 'london cool',
 'love ski',
 'wonder day',
 'looooool',
 'way cant eat shit',
 'nyc last week',
 'love girlfriend',
 'cooool',
 'like pasta',
 'end',
 'wholesal market ablaz',
 'alway tri bring heavi',
 'break news niger

In [12]:
cv = CountVectorizer()
X = cv.fit_transform(trainclean).toarray()
y = dftrain1["target"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

In [13]:
model = MultinomialNB()
model.fit(X_train, y_train)
y_pred=model.predict(X_test)
cf=classification_report(y_test,y_pred)
print(cf)

              precision    recall  f1-score   support

           0       0.77      0.86      0.81      1244
           1       0.80      0.68      0.74      1040

    accuracy                           0.78      2284
   macro avg       0.78      0.77      0.77      2284
weighted avg       0.78      0.78      0.78      2284



In [14]:
model1 = GaussianNB()
model1.fit(X_train, y_train)
y_pred=model1.predict(X_test)
cf=classification_report(y_test,y_pred)
print(cf)

              precision    recall  f1-score   support

           0       0.71      0.44      0.55      1244
           1       0.54      0.79      0.64      1040

    accuracy                           0.60      2284
   macro avg       0.63      0.62      0.59      2284
weighted avg       0.64      0.60      0.59      2284



In [15]:
model2 = DecisionTreeClassifier()
model2.fit(X_train, y_train)
y_pred=model2.predict(X_test)
cf=classification_report(y_test,y_pred)
print(cf)

              precision    recall  f1-score   support

           0       0.74      0.73      0.74      1244
           1       0.68      0.69      0.69      1040

    accuracy                           0.71      2284
   macro avg       0.71      0.71      0.71      2284
weighted avg       0.71      0.71      0.71      2284



In [16]:
model3 = LogisticRegression()
model3.fit(X_train, y_train)
y_pred=model3.predict(X_test)
cf=classification_report(y_test,y_pred)
print(cf)

              precision    recall  f1-score   support

           0       0.76      0.88      0.82      1244
           1       0.82      0.67      0.74      1040

    accuracy                           0.79      2284
   macro avg       0.79      0.78      0.78      2284
weighted avg       0.79      0.79      0.78      2284



In [17]:
model4 = RandomForestClassifier(n_estimators = 100)
model4.fit(X_train, y_train)
y_pred=model4.predict(X_test)
cf=classification_report(y_test,y_pred)
print(cf)

              precision    recall  f1-score   support

           0       0.75      0.82      0.78      1244
           1       0.76      0.68      0.71      1040

    accuracy                           0.75      2284
   macro avg       0.75      0.75      0.75      2284
weighted avg       0.75      0.75      0.75      2284



In [18]:
model5 = KNeighborsClassifier(n_neighbors = 3)
model5.fit(X_train, y_train)
y_pred=model5.predict(X_test)
cf=classification_report(y_test,y_pred)
print(cf)

              precision    recall  f1-score   support

           0       0.70      0.54      0.61      1244
           1       0.57      0.72      0.64      1040

    accuracy                           0.62      2284
   macro avg       0.63      0.63      0.62      2284
weighted avg       0.64      0.62      0.62      2284



In [19]:
model6 = RidgeClassifier()
model6.fit(X_train, y_train)
y_pred = model6.predict(X_test)
cf = classification_report(y_test, y_pred)
print(cf)

              precision    recall  f1-score   support

           0       0.75      0.85      0.80      1244
           1       0.78      0.67      0.72      1040

    accuracy                           0.76      2284
   macro avg       0.77      0.76      0.76      2284
weighted avg       0.77      0.76      0.76      2284



### Seems like GaussianNB and KNN have the worst scores by far, so we'll drop those from further analysis.

### Run the models with the PorterStemmer and TfidfVectorizer

In [21]:
tf = TfidfVectorizer()
X1_idf = tf.fit_transform(trainclean).toarray()
y1 = dftrain1["target"]
X_train, X_test, y_train, y_test = train_test_split(X1_idf, y1, test_size=0.3)

In [22]:
model = MultinomialNB()
model.fit(X_train, y_train)
y_pred=model.predict(X_test)
cf=classification_report(y_test,y_pred)
print(cf)

              precision    recall  f1-score   support

           0       0.77      0.91      0.83      1283
           1       0.85      0.65      0.74      1001

    accuracy                           0.80      2284
   macro avg       0.81      0.78      0.79      2284
weighted avg       0.81      0.80      0.79      2284



In [23]:
model1 = DecisionTreeClassifier()
model1.fit(X_train, y_train)
y_pred=model2.predict(X_test)
cf=classification_report(y_test,y_pred)
print(cf)

              precision    recall  f1-score   support

           0       0.75      0.72      0.73      1283
           1       0.66      0.68      0.67      1001

    accuracy                           0.70      2284
   macro avg       0.70      0.70      0.70      2284
weighted avg       0.71      0.70      0.71      2284



In [24]:
model2 = LogisticRegression()
model2.fit(X_train, y_train)
y_pred=model3.predict(X_test)
cf=classification_report(y_test,y_pred)
print(cf)

              precision    recall  f1-score   support

           0       0.76      0.91      0.83      1283
           1       0.85      0.63      0.73      1001

    accuracy                           0.79      2284
   macro avg       0.80      0.77      0.78      2284
weighted avg       0.80      0.79      0.78      2284



In [25]:
model3 = RandomForestClassifier(n_estimators = 100)
model3.fit(X_train, y_train)
y_pred=model4.predict(X_test)
cf=classification_report(y_test,y_pred)
print(cf)

              precision    recall  f1-score   support

           0       0.76      0.85      0.80      1283
           1       0.77      0.67      0.71      1001

    accuracy                           0.77      2284
   macro avg       0.77      0.76      0.76      2284
weighted avg       0.77      0.77      0.76      2284



In [26]:
model4 = RidgeClassifier()
model4.fit(X_train, y_train)
y_pred = model6.predict(X_test)
cf = classification_report(y_test, y_pred)
print(cf)

              precision    recall  f1-score   support

           0       0.77      0.86      0.81      1283
           1       0.79      0.67      0.72      1001

    accuracy                           0.78      2284
   macro avg       0.78      0.77      0.77      2284
weighted avg       0.78      0.78      0.77      2284



### Running the models with a lemmatizer and Count Vectorizer

In [27]:
lemma = WordNetLemmatizer()
stop_words = stopwords.words("english")
def clean_tweet2(X6):
    if type(X6) == np.float:
        return ""
    tweet = X6.lower()
    tweet = re.sub("'", "", tweet) 
    tweet = re.sub("@[A-Za-z0-9_]+","", tweet)
    tweet = re.sub("#[A-Za-z0-9_]+","", tweet)
    tweet = re.sub(r'http\S+', '', tweet)
    tweet = re.sub('[()!?]', ' ', tweet)
    tweet = re.sub('\[.*?\]',' ', tweet)
    tweet = re.sub("[^a-z0-9]"," ", tweet)
    tweet = tweet.split()
    tweet =[lemma.lemmatize(word) for word in tweet if (word not in stop_words)]
    tweet = " ".join(word for word in tweet)
    
    return tweet

In [28]:
X6 = dftrain1["text"]
trainclean1 = [clean_tweet2(tw) for tw in X6]
trainclean1

['deed reason may allah forgive u',
 'forest fire near la ronge sask canada',
 'resident asked shelter place notified officer evacuation shelter place order expected',
 '13 000 people receive evacuation order california',
 'got sent photo ruby smoke pours school',
 'update california hwy 20 closed direction due lake county fire',
 'heavy rain cause flash flooding street manitou colorado spring area',
 'im top hill see fire wood',
 'there emergency evacuation happening building across street',
 'im afraid tornado coming area',
 'three people died heat wave far',
 'haha south tampa getting flooded hah wait second live south tampa gonna gonna fvck',
 '18 19 day ive lost count',
 'bago myanmar arrived bago',
 'damage school bus 80 multi car crash',
 'whats man',
 'love fruit',
 'summer lovely',
 'car fast',
 'goooooooaaaaaal',
 'ridiculous',
 'london cool',
 'love skiing',
 'wonderful day',
 'looooool',
 'way cant eat shit',
 'nyc last week',
 'love girlfriend',
 'cooool',
 'like pasta',
 

In [30]:
cv = CountVectorizer()
X2 = cv.fit_transform(trainclean1).toarray()
y2 = dftrain1["target"]
X_train, X_test, y_train, y_test = train_test_split(X2, y2, test_size=0.3)

In [31]:
model=MultinomialNB()
model.fit(X_train,y_train)
y_pred=model.predict(X_test)
model.fit(X_train, y_train)
cf=classification_report(y_test,y_pred)
print(cf)

              precision    recall  f1-score   support

           0       0.78      0.84      0.81      1292
           1       0.77      0.70      0.73       992

    accuracy                           0.78      2284
   macro avg       0.78      0.77      0.77      2284
weighted avg       0.78      0.78      0.78      2284



In [32]:
model1 = DecisionTreeClassifier()
model1.fit(X_train, y_train)
y_pred=model1.predict(X_test)
cf=classification_report(y_test,y_pred)
print(cf)

              precision    recall  f1-score   support

           0       0.75      0.73      0.74      1292
           1       0.66      0.67      0.67       992

    accuracy                           0.71      2284
   macro avg       0.70      0.70      0.70      2284
weighted avg       0.71      0.71      0.71      2284



In [33]:
model2 = LogisticRegression()
model2.fit(X_train, y_train)
y_pred=model2.predict(X_test)
cf=classification_report(y_test,y_pred)
print(cf)

              precision    recall  f1-score   support

           0       0.77      0.87      0.82      1292
           1       0.80      0.67      0.72       992

    accuracy                           0.78      2284
   macro avg       0.78      0.77      0.77      2284
weighted avg       0.78      0.78      0.78      2284



In [34]:
model3 = RandomForestClassifier(n_estimators = 100)
model3.fit(X_train, y_train)
y_pred=model3.predict(X_test)
cf=classification_report(y_test,y_pred)
print(cf)

              precision    recall  f1-score   support

           0       0.76      0.83      0.80      1292
           1       0.75      0.67      0.71       992

    accuracy                           0.76      2284
   macro avg       0.76      0.75      0.75      2284
weighted avg       0.76      0.76      0.76      2284



In [35]:
model4 = RidgeClassifier()
model4.fit(X_train, y_train)
y_pred = model4.predict(X_test)
cf = classification_report(y_test, y_pred)
print(cf)

              precision    recall  f1-score   support

           0       0.75      0.83      0.79      1292
           1       0.74      0.64      0.69       992

    accuracy                           0.75      2284
   macro avg       0.75      0.74      0.74      2284
weighted avg       0.75      0.75      0.75      2284



### Running the models with a lemmatizer and Tfidf Vectorizer

In [36]:
tf = TfidfVectorizer()
X1_idf = tf.fit_transform(trainclean1).toarray()
y3 = dftrain1["target"]
X_train, X_test, y_train, y_test = train_test_split(X1_idf, y3, test_size=0.3)

In [37]:
model=MultinomialNB()
model.fit(X_train,y_train)
y_pred=model.predict(X_test)
cf=classification_report(y_test,y_pred)
print(cf)

              precision    recall  f1-score   support

           0       0.79      0.90      0.84      1332
           1       0.83      0.66      0.73       952

    accuracy                           0.80      2284
   macro avg       0.81      0.78      0.79      2284
weighted avg       0.80      0.80      0.80      2284



In [38]:
model1 = DecisionTreeClassifier()
model1.fit(X_train, y_train)
y_pred=model1.predict(X_test)
cf=classification_report(y_test,y_pred)
print(cf)

              precision    recall  f1-score   support

           0       0.79      0.72      0.75      1332
           1       0.65      0.73      0.69       952

    accuracy                           0.72      2284
   macro avg       0.72      0.72      0.72      2284
weighted avg       0.73      0.72      0.73      2284



In [39]:
model2 = LogisticRegression()
model2.fit(X_train, y_train)
y_pred=model2.predict(X_test)
cf=classification_report(y_test,y_pred)
print(cf)

              precision    recall  f1-score   support

           0       0.79      0.92      0.85      1332
           1       0.85      0.65      0.74       952

    accuracy                           0.81      2284
   macro avg       0.82      0.79      0.79      2284
weighted avg       0.81      0.81      0.80      2284



In [40]:
model3 = RandomForestClassifier(n_estimators = 100)
model3.fit(X_train, y_train)
y_pred=model3.predict(X_test)
cf=classification_report(y_test,y_pred)
print(cf)

              precision    recall  f1-score   support

           0       0.79      0.84      0.82      1332
           1       0.76      0.69      0.72       952

    accuracy                           0.78      2284
   macro avg       0.78      0.77      0.77      2284
weighted avg       0.78      0.78      0.78      2284



In [41]:
model4 = RidgeClassifier()
model4.fit(X_train, y_train)
y_pred = model4.predict(X_test)
cf = classification_report(y_test, y_pred)
print(cf)

              precision    recall  f1-score   support

           0       0.80      0.89      0.84      1332
           1       0.81      0.69      0.75       952

    accuracy                           0.81      2284
   macro avg       0.81      0.79      0.79      2284
weighted avg       0.81      0.81      0.80      2284



### Best F1-score is Logistic Regression with a Tfidf vectorizer
### Let's test the model 

In [42]:
x = dftrain1["text"]
y = dftrain1["target"]
x_final = dftest1["text"]

In [43]:
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.3)

In [44]:
tf = TfidfVectorizer()
X_train = tf.fit_transform(X_train).toarray()
X_test = tf.transform(X_test).toarray()
X_pred = tf.transform(x_final).toarray()

In [46]:
lr = LogisticRegression()
lr.fit(X_train, y_train)
finalmodel = lr.predict(X_pred)
finalmodel

array([1, 0, 1, ..., 1, 1, 0], dtype=int64)

In [47]:
submission = dftest1[['id']].reset_index(drop=True)
submission['target'] = finalmodel.astype('int64')

In [52]:
submission.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3263 entries, 0 to 3262
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   id      3263 non-null   int64
 1   target  3263 non-null   int64
dtypes: int64(2)
memory usage: 51.1 KB


In [51]:
submission.to_csv("submission.csv", index=False)