In [1]:
import numpy as np 
import pandas as pd 

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/nlp-getting-started/sample_submission.csv
/kaggle/input/nlp-getting-started/train.csv
/kaggle/input/nlp-getting-started/test.csv


In [2]:
train_df = pd.read_csv("/kaggle/input/nlp-getting-started/train.csv") 
test_df = pd.read_csv("/kaggle/input/nlp-getting-started/test.csv")

In [3]:
train_df

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1
...,...,...,...,...,...
7608,10869,,,Two giant cranes holding a bridge collapse int...,1
7609,10870,,,@aria_ahrary @TheTawniest The out of control w...,1
7610,10871,,,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1
7611,10872,,,Police investigating after an e-bike collided ...,1


In [4]:
train_df.isnull().sum()

id             0
keyword       61
location    2533
text           0
target         0
dtype: int64

In [5]:
train_df = train_df.fillna("")

In [6]:
train_df.isnull().sum()

id          0
keyword     0
location    0
text        0
target      0
dtype: int64

In [7]:
import nltk
#nltk.download('wordnet')
#!unzip /usr/share/nltk_data/corpora/wordnet.zip -d /usr/share/nltk_data/corpora/

In [8]:
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
lm= WordNetLemmatizer()
import emoji

def preprocess_text_column(df, column_name):
    # Lowercasing
    df[column_name] = df[column_name].str.lower()

    # Remove URLs
    url_pattern = r'http[s]?://\S+'
    df[column_name] = df[column_name].apply(lambda x: re.sub(url_pattern, '', x))

    # Remove HTML tags
    html_pattern = r'<.*?>'
    df[column_name] = df[column_name].apply(lambda x: re.sub(html_pattern, '', x))

    # Remove mentions (e.g., @username)
    mention_pattern = r'@\w+'
    df[column_name] = df[column_name].apply(lambda x: re.sub(mention_pattern, '', x))

    # Remove punctuation
    punctuation_pattern = r'[^\w\s]'
    df[column_name] = df[column_name].apply(lambda x: re.sub(punctuation_pattern, '', x))

    # Remove emojis
    df[column_name] = df[column_name].apply(lambda x: emoji.replace_emoji(x,''))

    # Tokenization
    df[column_name] = df[column_name].apply(lambda x: word_tokenize(x))

    # Download stopwords and remove them
    nltk.download('stopwords')
    stop_words = set(stopwords.words('english'))
    df[column_name] = df[column_name].apply(lambda x: [word for word in x if word not in stop_words])

    # Download WordNet for lemmatization
    nltk.download('wordnet')
    lemmatizer = WordNetLemmatizer()
    df[column_name] = df[column_name].apply(lambda x: [lemmatizer.lemmatize(word) for word in x])

    # Join Tokens Back to Text
    df[column_name] = df[column_name].apply(lambda x: ' '.join(x))

    return df


In [9]:
train_df = preprocess_text_column(train_df, 'text')

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [10]:
train_df

Unnamed: 0,id,keyword,location,text,target
0,1,,,deed reason earthquake may allah forgive u,1
1,4,,,forest fire near la ronge sask canada,1
2,5,,,resident asked shelter place notified officer ...,1
3,6,,,13000 people receive wildfire evacuation order...,1
4,7,,,got sent photo ruby alaska smoke wildfire pour...,1
...,...,...,...,...,...
7608,10869,,,two giant crane holding bridge collapse nearby...,1
7609,10870,,,control wild fire california even northern par...,1
7610,10871,,,m194 0104 utc5km volcano hawaii,1
7611,10872,,,police investigating ebike collided car little...,1


In [11]:
test_df

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan
...,...,...,...,...
3258,10861,,,EARTHQUAKE SAFETY LOS ANGELES ÛÒ SAFETY FASTE...
3259,10865,,,Storm in RI worse than last hurricane. My city...
3260,10868,,,Green Line derailment in Chicago http://t.co/U...
3261,10874,,,MEG issues Hazardous Weather Outlook (HWO) htt...


In [12]:
test_df.isnull().sum()

id             0
keyword       26
location    1105
text           0
dtype: int64

In [13]:
test_df = test_df.fillna("")

In [14]:
test_df = preprocess_text_column(test_df, 'text')

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [15]:
test_df

Unnamed: 0,id,keyword,location,text
0,0,,,happened terrible car crash
1,2,,,heard earthquake different city stay safe ever...
2,3,,,forest fire spot pond goose fleeing across str...
3,9,,,apocalypse lighting spokane wildfire
4,11,,,typhoon soudelor kill 28 china taiwan
...,...,...,...,...
3258,10861,,,earthquake safety los angeles ûò safety fasten...
3259,10865,,,storm ri worse last hurricane cityamp3others h...
3260,10868,,,green line derailment chicago
3261,10874,,,meg issue hazardous weather outlook hwo


In [16]:
X = train_df['text']
y = train_df['target']

In [17]:
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer()

X_train = tfidf.fit_transform(X_train)
X_val = tfidf.transform(X_val)
X_test = tfidf.transform(test_df['text'])

In [19]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression()
model.fit(X_train, y_train)

In [20]:
y_pred = model.predict(X_val)

In [25]:
from sklearn.metrics import accuracy_score, classification_report

print(classification_report(y_val, y_pred))

              precision    recall  f1-score   support

           0       0.79      0.89      0.84       874
           1       0.82      0.68      0.74       649

    accuracy                           0.80      1523
   macro avg       0.80      0.78      0.79      1523
weighted avg       0.80      0.80      0.80      1523



In [27]:
data = pd.DataFrame()
data['text'] = test_df['text']

In [28]:
data['target'] = model.predict(X_test)

In [29]:
data

Unnamed: 0,text,target
0,happened terrible car crash,1
1,heard earthquake different city stay safe ever...,1
2,forest fire spot pond goose fleeing across str...,1
3,apocalypse lighting spokane wildfire,1
4,typhoon soudelor kill 28 china taiwan,1
...,...,...
3258,earthquake safety los angeles ûò safety fasten...,1
3259,storm ri worse last hurricane cityamp3others h...,1
3260,green line derailment chicago,1
3261,meg issue hazardous weather outlook hwo,1
