In [1]:
#importing the packages

import pandas as pd

In [2]:
df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')
submission=pd.read_csv('sample_submission.csv')

In [3]:
df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [4]:
#Extract the X and y

X=df['text']
y=df['target']

In [5]:
X.head()

0    Our Deeds are the Reason of this #earthquake M...
1               Forest fire near La Ronge Sask. Canada
2    All residents asked to 'shelter in place' are ...
3    13,000 people receive #wildfires evacuation or...
4    Just got sent this photo from Ruby #Alaska as ...
Name: text, dtype: object

In [6]:
y.head()

0    1
1    1
2    1
3    1
4    1
Name: target, dtype: int64

In [7]:
#value counts of target 

df['target'].value_counts()

0    4342
1    3271
Name: target, dtype: int64

In [8]:
## splitting

from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=.2,random_state=42)

### Preprocessing

In [9]:
import nltk
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

In [10]:
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ELCOT\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [11]:
# clean processing

def clean_review(review):
    tokens = review.lower().split()
    filtered_tokens = [lemmatizer.lemmatize(w)
                       for w in tokens if w not in stop_words]
    return " ".join(filtered_tokens)

In [12]:
X1=X.apply(clean_review)
X1

0             deed reason #earthquake may allah forgive u
1                  forest fire near la ronge sask. canada
2       resident asked 'shelter place' notified office...
3       13,000 people receive #wildfires evacuation or...
4       got sent photo ruby #alaska smoke #wildfires p...
                              ...                        
7608    two giant crane holding bridge collapse nearby...
7609    @aria_ahrary @thetawniest control wild fire ca...
7610    m1.94 [01:04 utc]?5km volcano hawaii. http://t...
7611    police investigating e-bike collided car littl...
7612    latest: home razed northern california wildfir...
Name: text, Length: 7613, dtype: object

In [13]:
#### Vectorizing

from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(min_df=3, max_features=None,ngram_range=(1,2),use_idf=1)

In [14]:
x_train1 = vectorizer.fit_transform(X_train)
x_test1=vectorizer.transform(X_test)

In [15]:
### Model

from sklearn.naive_bayes import MultinomialNB
mnb= MultinomialNB()


In [16]:
model=mnb.fit(x_train1,y_train)

y_pred_mnb=model.predict(x_test1)

In [17]:
## Accuracy

from sklearn.metrics import accuracy_score
acc_mnb= accuracy_score(y_pred_mnb,y_test)
acc_mnb

0.8049901510177282

In [18]:
## Confusion matrix

from sklearn.metrics import confusion_matrix

cm=confusion_matrix(y_pred_mnb,y_test)
cm

array([[817, 240],
       [ 57, 409]], dtype=int64)

In [19]:
# classificaion report

from sklearn.metrics  import classification_report

cr=classification_report(y_test,y_pred_mnb)
print(cr)

              precision    recall  f1-score   support

           0       0.77      0.93      0.85       874
           1       0.88      0.63      0.73       649

    accuracy                           0.80      1523
   macro avg       0.83      0.78      0.79      1523
weighted avg       0.82      0.80      0.80      1523



In [20]:

test_texts = vectorizer.transform(test_df["text"])

In [21]:
submission["target"] = model.predict(test_texts)
submission.head()

Unnamed: 0,id,target
0,0,1
1,2,0
2,3,1
3,9,1
4,11,1


In [26]:
submission.to_csv('Submission-1.csv', index=False)

In [27]:
df2=pd.read_csv('Submission-1.csv')
df2.head()

Unnamed: 0,id,target
0,0,1
1,2,0
2,3,1
3,9,1
4,11,1


In [28]:
df2.shape

(3263, 2)