In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')

In [3]:
df_train

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1
...,...,...,...,...,...
7608,10869,,,Two giant cranes holding a bridge collapse int...,1
7609,10870,,,@aria_ahrary @TheTawniest The out of control w...,1
7610,10871,,,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1
7611,10872,,,Police investigating after an e-bike collided ...,1


In [4]:
df_train.isna().sum()

id             0
keyword       61
location    2533
text           0
target         0
dtype: int64

In [5]:
df_train['keyword'].unique()

array([nan, 'ablaze', 'accident', 'aftershock', 'airplane%20accident',
       'ambulance', 'annihilated', 'annihilation', 'apocalypse',
       'armageddon', 'army', 'arson', 'arsonist', 'attack', 'attacked',
       'avalanche', 'battle', 'bioterror', 'bioterrorism', 'blaze',
       'blazing', 'bleeding', 'blew%20up', 'blight', 'blizzard', 'blood',
       'bloody', 'blown%20up', 'body%20bag', 'body%20bagging',
       'body%20bags', 'bomb', 'bombed', 'bombing', 'bridge%20collapse',
       'buildings%20burning', 'buildings%20on%20fire', 'burned',
       'burning', 'burning%20buildings', 'bush%20fires', 'casualties',
       'casualty', 'catastrophe', 'catastrophic', 'chemical%20emergency',
       'cliff%20fall', 'collapse', 'collapsed', 'collide', 'collided',
       'collision', 'crash', 'crashed', 'crush', 'crushed', 'curfew',
       'cyclone', 'damage', 'danger', 'dead', 'death', 'deaths', 'debris',
       'deluge', 'deluged', 'demolish', 'demolished', 'demolition',
       'derail', 'der

In [6]:
df_train = df_train.drop(['keyword','location', 'id'], axis = 1)

In [7]:
df_train

Unnamed: 0,text,target
0,Our Deeds are the Reason of this #earthquake M...,1
1,Forest fire near La Ronge Sask. Canada,1
2,All residents asked to 'shelter in place' are ...,1
3,"13,000 people receive #wildfires evacuation or...",1
4,Just got sent this photo from Ruby #Alaska as ...,1
...,...,...
7608,Two giant cranes holding a bridge collapse int...,1
7609,@aria_ahrary @TheTawniest The out of control w...,1
7610,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1
7611,Police investigating after an e-bike collided ...,1


In [8]:
df_test

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan
...,...,...,...,...
3258,10861,,,EARTHQUAKE SAFETY LOS ANGELES ÛÒ SAFETY FASTE...
3259,10865,,,Storm in RI worse than last hurricane. My city...
3260,10868,,,Green Line derailment in Chicago http://t.co/U...
3261,10874,,,MEG issues Hazardous Weather Outlook (HWO) htt...


In [9]:
df_test = df_test.drop(['keyword','location', 'id'], axis = 1)

In [10]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    7613 non-null   object
 1   target  7613 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 119.1+ KB


In [11]:
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3263 entries, 0 to 3262
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    3263 non-null   object
dtypes: object(1)
memory usage: 25.6+ KB


In [12]:
from sklearn.feature_extraction.text import CountVectorizer

In [13]:
cv = CountVectorizer(stop_words='english')

In [14]:
import warnings
warnings.filterwarnings('ignore')
matrix = cv.fit_transform(df_train[df_train['target']==1]['text'])
freqs = zip(cv.get_feature_names(), matrix.sum(axis=0).tolist()[0])    

print("Top 20 words for disaster")
print(sorted(freqs, key=lambda x: -x[1])[:20])

Top 20 words for disaster
[('http', 2382), ('û_', 172), ('news', 140), ('amp', 135), ('https', 134), ('disaster', 121), ('california', 115), ('suicide', 112), ('police', 109), ('people', 105), ('killed', 95), ('like', 94), ('hiroshima', 92), ('just', 89), ('storm', 89), ('fires', 86), ('crash', 85), ('families', 81), ('train', 79), ('emergency', 77)]


In [15]:
matrix = cv.fit_transform(df_train[df_train['target']==0]['text'])
freqs = zip(cv.get_feature_names(), matrix.sum(axis=0).tolist()[0])    

print("Top 20 words used for no disaster.")
print(sorted(freqs, key=lambda x: -x[1])[:20])

Top 20 words used for no disaster.
[('http', 1927), ('https', 277), ('like', 254), ('just', 234), ('amp', 209), ('û_', 176), ('new', 170), ('don', 138), ('body', 116), ('video', 96), ('people', 95), ('love', 90), ('day', 86), ('know', 86), ('time', 85), ('got', 84), ('emergency', 81), ('going', 76), ('let', 76), ('youtube', 76)]


In [16]:
X = df_train['text']
y = df_train['target']

In [17]:
from sklearn.model_selection import train_test_split

In [18]:
X_train = df_train['text']
y_train = df_train['target']
X_test = df_test['text']

In [19]:
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.1)

In [20]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline

In [21]:
tfidf = TfidfVectorizer()

In [22]:
X_train

3524    WFTV Eyewitness News: TN school psychologist a...
6881    I'm that traumatised that I can't even spell p...
2697    New SMP Ignition Knock (Detonation) Sensor KS3...
3225    #Healthcare #Job in #Kodiak AK: Emergency Serv...
3795    #SigAlert: North &amp; Southbound 133 closed b...
                              ...                        
6832                         trapped in its disappearance
4230    MEG issues Hazardous Weather Outlook (HWO)  ht...
1318    @nagel_ashley @Vicken52 @BasedLaRock @goonc1ty...
4289    HELLFIRE EP - SILENTMIND &amp; @_bookofdaniel ...
5515    gmtTy mhtw4fnet\n\nOfficials: Alabama home qua...
Name: text, Length: 6851, dtype: object

In [23]:
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)
X_val_tfidf = tfidf.transform(X_val)
X_train_tfidf = X_train_tfidf.toarray()
X_test_tfidf = X_test_tfidf.toarray()
X_val_tfidf = X_val_tfidf.toarray()

In [24]:
X_train_tfidf

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [26]:
import tensorflow as tf
from tensorflow import keras

In [27]:
X_train_tfidf.shape

(6851, 20188)

In [127]:
model = keras.Sequential([
    keras.layers.Dense(units = 1, activation='relu', input_dim = X_train_tfidf.shape[1]),
    keras.layers.Dense(units = 8, activation='relu'),
    keras.layers.Dense(units = 16, activation='relu'),
    keras.layers.Dense(units = 32, activation='relu'),
    keras.layers.Dense(units = 64, activation='relu'),
    keras.layers.Dense(units = 1, activation = 'sigmoid')
])
model.compile(optimizer='adam', loss = 'binary_crossentropy', metrics=['accuracy'])

In [128]:
model.fit(X_train_tfidf, y_train, epochs=50)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.src.callbacks.History at 0x1d87fe55150>

In [129]:
from sklearn.metrics import classification_report,plot_confusion_matrix

In [130]:
y_pred_prob = model.predict(X_test_tfidf)



In [131]:
y_pred = (y_pred_prob > 0.5).astype(int)

In [132]:
y_pred_prob1 = model.predict(X_val_tfidf)



In [133]:
y_pred1 = (y_pred_prob1 > 0.5).astype(int)

In [134]:
print(classification_report(y_val, y_pred1))

              precision    recall  f1-score   support

           0       0.77      0.76      0.76       438
           1       0.68      0.69      0.68       324

    accuracy                           0.73       762
   macro avg       0.72      0.72      0.72       762
weighted avg       0.73      0.73      0.73       762



In [135]:
sub = pd.read_csv('sample_submission.csv')

In [136]:
sub['id'].shape

(3263,)

In [137]:
y_pred.shape

(3263, 1)

In [138]:
y_pred = y_pred.flatten()

In [139]:
y_pred.shape

(3263,)

In [140]:
sub['target'] = y_pred

In [141]:
sub.to_csv('nn.csv', index=False, index_label=False)

In [158]:
X = df_train['text']
y = df_train['target']

In [159]:
from sklearn.model_selection import train_test_split

In [160]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=101)

In [161]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV

In [162]:
pipe = Pipeline([('tfidf', TfidfVectorizer()),('nb', MultinomialNB())])

In [163]:
params = {
    'nb__alpha':[0.1, 0.3, 0.5, 0.6, 0.7, 0.8, 1.0],
    'nb__fit_prior':[True, False], 
    'nb__class_prior':[None, 0.2, 0.3, 0.4, 0.5, 0.7, 1.0]
}

In [164]:
full_model = GridSearchCV(pipe, param_grid=params, cv = 3, verbose = 2)

In [165]:
full_model.fit(X_train, y_train)  

Fitting 3 folds for each of 98 candidates, totalling 294 fits
[CV] END nb__alpha=0.1, nb__class_prior=None, nb__fit_prior=True; total time=   0.0s
[CV] END nb__alpha=0.1, nb__class_prior=None, nb__fit_prior=True; total time=   0.0s
[CV] END nb__alpha=0.1, nb__class_prior=None, nb__fit_prior=True; total time=   0.0s
[CV] END nb__alpha=0.1, nb__class_prior=None, nb__fit_prior=False; total time=   0.0s
[CV] END nb__alpha=0.1, nb__class_prior=None, nb__fit_prior=False; total time=   0.0s
[CV] END nb__alpha=0.1, nb__class_prior=None, nb__fit_prior=False; total time=   0.0s
[CV] END nb__alpha=0.1, nb__class_prior=0.2, nb__fit_prior=True; total time=   0.0s
[CV] END nb__alpha=0.1, nb__class_prior=0.2, nb__fit_prior=True; total time=   0.0s
[CV] END nb__alpha=0.1, nb__class_prior=0.2, nb__fit_prior=True; total time=   0.0s
[CV] END nb__alpha=0.1, nb__class_prior=0.2, nb__fit_prior=False; total time=   0.0s
[CV] END nb__alpha=0.1, nb__class_prior=0.2, nb__fit_prior=False; total time=   0.0s
[CV

In [149]:
full_model.best_params_

{'nb__alpha': 0.7, 'nb__class_prior': None, 'nb__fit_prior': True}

In [150]:
from sklearn.metrics import classification_report,plot_confusion_matrix

In [155]:
pipe

In [154]:
preds = full_model.predict(X_test_tfidf)

AttributeError: 'numpy.ndarray' object has no attribute 'lower'

In [156]:
y_val

5320    0
1025    0
4905    1
4386    0
6617    1
       ..
644     0
3457    1
3053    1
3102    0
1360    1
Name: target, Length: 762, dtype: int64

In [157]:
preds

array([0, 0, 1, ..., 1, 1, 1], dtype=int64)

In [153]:
print(classification_report(y_val,preds))

ValueError: Found input variables with inconsistent numbers of samples: [762, 3263]