<a href="https://www.kaggle.com/code/abdulaziz04/featureextraction-randomforestclassifier?scriptVersionId=101227185" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import plotly.express as px
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import f1_score
from sklearn.ensemble import RandomForestClassifier



# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/nlp-getting-started/sample_submission.csv
/kaggle/input/nlp-getting-started/train.csv
/kaggle/input/nlp-getting-started/test.csv


# Read the CSV Files

In [2]:
xtrain=pd.read_csv('/kaggle/input/nlp-getting-started/train.csv')
xtest=pd.read_csv('/kaggle/input/nlp-getting-started/test.csv')
sample=pd.read_csv('/kaggle/input/nlp-getting-started/sample_submission.csv')

In [3]:
xtrain.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


## Create a features dataset containing 'text' 'location' and 'keyword'

In [4]:
y=xtrain['target']
ids=xtest['id']
xtrain.drop(['id','target'],inplace=True,axis=1)
xtest.drop('id',inplace=True,axis=1)
xtrain

Unnamed: 0,keyword,location,text
0,,,Our Deeds are the Reason of this #earthquake M...
1,,,Forest fire near La Ronge Sask. Canada
2,,,All residents asked to 'shelter in place' are ...
3,,,"13,000 people receive #wildfires evacuation or..."
4,,,Just got sent this photo from Ruby #Alaska as ...
...,...,...,...
7608,,,Two giant cranes holding a bridge collapse int...
7609,,,@aria_ahrary @TheTawniest The out of control w...
7610,,,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...
7611,,,Police investigating after an e-bike collided ...


## Check for null values and perform necessary imputations

In [5]:
xtrain.isnull().sum()

keyword       61
location    2533
text           0
dtype: int64

## Get the top 15 values and visualize their frequency
For Categorical values,we generally use Mode

In [6]:
top_locations=xtrain['location'].value_counts().index.to_list()[:15]
top_loc_df=xtrain[xtrain.location.isin(top_locations)]
top_loc_df.head()

Unnamed: 0,keyword,location,text
51,ablaze,India,Man wife get six years jail for setting ablaze...
55,ablaze,USA,#Kurds trampling on Turkmen flag later set it ...
70,accident,UK,http://t.co/GKYe6gjTk5 Had a #personalinjury a...
73,accident,Australia,BigRigRadio Live Accident Awareness
100,accident,UK,.@NorwayMFA #Bahrain police had previously die...


In [7]:
px.histogram(top_loc_df,x='location',color='location')

In [8]:
xtrain['location'].fillna(xtrain['location'].mode()[0],inplace=True)
xtest['location'].fillna(xtrain['location'].mode()[0],inplace=True)


## Same for the 'Keyword' column, here the frequency is almost same so we can use multiple values to perform imputation but as the missing count is too small, we will use single mode only

In [9]:
top_keywords=xtrain['keyword'].value_counts().index.to_list()[:15]
top_keywords_df=xtrain[xtrain.keyword.isin(top_keywords)]
top_keywords_df.head()

Unnamed: 0,keyword,location,text
304,armageddon,"California, United States",#PBBan (Temporary:300) avYsss @'aRmageddon | D...
305,armageddon,"California, United States",#PBBan (Temporary:300) Russaky89 @'aRmageddon ...
306,armageddon,#FLIGHTCITY UK,((OFFICIAL VID)) #DoubleCups &gt;&gt; https://...
307,armageddon,USA,ouvindo Peace Love &amp; Armageddon
308,armageddon,USA,Best movie you've ever seen? - Armageddon htt...


In [10]:
px.histogram(top_keywords_df,x='keyword',color='keyword')

In [11]:
xtrain['keyword'].fillna(xtrain['keyword'].mode()[0],inplace=True)
xtest['keyword'].fillna(xtrain['keyword'].mode()[0],inplace=True)

## Now our dataset is free of all missing values

In [12]:
xtrain.isnull().sum()

keyword     0
location    0
text        0
dtype: int64

## Replacing unnecessary characters with blank space, helps in better tokenization

In [13]:

xtrain['text']=xtrain.text.str.replace('[^a-zA-Z0-9\s#]', '',regex=True)
xtrain['location']=xtrain.location.str.replace('[^a-zA-Z0-9\s]', '',regex=True)
xtrain['keyword']=xtrain.keyword.str.replace('[^a-zA-Z0-9\s]', '',regex=True)


In [14]:

xtest['text']=xtest.text.str.replace('[^a-zA-Z0-9\s#]', '',regex=True)
xtest['location']=xtest.location.str.replace('[^a-zA-Z0-9\s]', '',regex=True)
xtest['keyword']=xtest.keyword.str.replace('[^a-zA-Z0-9\s]', '',regex=True)


## Stripping location text again as there were a few blank space tokens

In [15]:

xtrain['location']=xtrain['location'].str.strip()
xtest['location']=xtest['location'].str.strip()


In [16]:
xtrain.text[0]

'Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all'

## TFIDF provides the measure of occurence of a particular word whereas word_tokenize tokenizes a given sentence to a set of tokens

In [17]:
tfidf = TfidfVectorizer(tokenizer=word_tokenize, token_pattern=None)

#For Train Set
train_texts=tfidf.fit_transform(xtrain['text'])
train_texts=pd.DataFrame(train_texts.toarray(),columns=tfidf.get_feature_names_out())

#For Test Set
test_texts=tfidf.fit_transform(xtest['text'])
test_texts=pd.DataFrame(test_texts.toarray(),columns=tfidf.get_feature_names_out())

#Left Join
train_texts,test_texts=train_texts.align(test_texts,join='left',axis=1)

## Fill useless features with zero values because similar train set features are not observed in test set

In [18]:
test_texts.fillna(0,inplace=True)
test_texts.isnull().sum().sum()

0

## One Hot Encoding the location and keyword column to generate a new set of features

In [19]:
ohe_train_location=pd.get_dummies(xtrain['location'])
ohe_test_location=pd.get_dummies(xtest['location'])
train_locations,test_locations=ohe_train_location.align(ohe_test_location, join='left', axis=1)

In [20]:
test_locations.fillna(0,inplace=True)
test_locations.isnull().sum().sum()

0

In [21]:
ohe_train_keywords=pd.get_dummies(xtrain['keyword'])
ohe_test_keywords=pd.get_dummies(xtest['keyword'])
train_keywords,test_keywords=ohe_train_location.align(ohe_test_location, join='left', axis=1)

In [22]:
test_keywords.fillna(0,inplace=True)
test_keywords.isnull().sum().sum()

0

## Building the Final DataFrame with all necessary features

In [23]:
final_train_df=pd.concat([train_texts,train_locations,train_keywords],axis=1)
final_test_df=pd.concat([test_texts,test_locations,test_keywords],axis=1)
final_train_df

Unnamed: 0,#,0,0011,001116,0025,005225,010156,010217,0104,010401,...,wwwfacebookcomstuntfm,wwwtmgcgartcom,wwwtwitchtvPKSparkxx,wwwyoutubecomMalkavius2,xiumins nonexistent solos,yel,yorkshire,your boyfriends legs,youtubecomchannelUCHWTLC9B4ZjUGh7yDlb55Iw,zboyerwashingtontimescom
0,0.116225,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0,0,0,0,0,0,0,0,0,0
1,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0,0,0,0,0,0,0,0,0,0
2,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0,0,0,0,0,0,0,0,0,0
3,0.130358,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0,0,0,0,0,0,0,0,0,0
4,0.193995,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7608,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0,0,0,0,0,0,0,0,0,0
7609,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0,0,0,0,0,0,0,0,0,0
7610,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.407311,0.0,...,0,0,0,0,0,0,0,0,0,0
7611,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0,0,0,0,0,0,0,0,0,0


## Fitting the Data and calculating Cross validation score

In [24]:

model=RandomForestClassifier(n_jobs=-1,random_state=1)
model.fit(final_train_df,y)
scores = cross_val_score(model, final_train_df, y, cv=5)
scores.mean()


0.6946070027428746

## Calculating F1 score

In [25]:
train_results=model.predict(final_train_df)

In [26]:
f1_train=f1_score(y,train_results)
print(f'F1 score on train set {f1_train}')

F1 score on train set 0.9975542647508407


In [27]:
test_results=model.predict(final_test_df)

# Submission

In [28]:

result_df=pd.DataFrame({'id':ids,'target':test_results})
result_df.to_csv('submission.csv',index=False)
print('File generated successfully ! ')


File generated successfully ! 
