In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
#import spacy

### import data and feature engineering

In [2]:
train_df=pd.read_csv('train.csv')

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer

# cleaning the text
train_df['text']=train_df['text'].str.replace('[^a-zA-Z]',' ') 



# converting the text in vectorial form
tf=TfidfVectorizer(stop_words='english',ngram_range=(1,2),max_features=1000)   
tf_vec=tf.fit_transform(train_df['text'])

# build a new DataFrame with the converted text
text_df=pd.DataFrame(tf_vec.toarray(),columns=tf.get_feature_names()).add_prefix('TXT_')

print(text_df.shape)
text_df.head()

(7613, 1000)


Unnamed: 0,TXT_abc,TXT_abc news,TXT_ablaze,TXT_accident,TXT_action,TXT_actually,TXT_added,TXT_affected,TXT_affected fatal,TXT_aftershock,...,TXT_years,TXT_yes,TXT_york,TXT_young,TXT_youth,TXT_youtube,TXT_youtube video,TXT_yr,TXT_yr old,TXT_zone
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [4]:
# concatenating the two dataframes
new_df=pd.concat([train_df,text_df],axis=1)
new_df.drop('text',axis=1,inplace=True)

### preparing features and targets for the machine learning 

In [5]:
y=new_df['target']
X=new_df.drop(['keyword','location','target','id'],axis=1)

### building the algorithm

In [6]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report,confusion_matrix
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate
from sklearn.feature_selection import RFE
from sklearn.naive_bayes import MultinomialNB

In [7]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.25,stratify=y)

rf=MultinomialNB()

rf.fit(X_train,y_train)
y_pred=rf.predict(X_test)

print("----------------------------------------------------") 
print("Confusion matrix \n"+str(confusion_matrix(y_pred,y_test))) 
print("----------------------------------------------------") 
print("Report \n"+str(classification_report(y_test,y_pred))) 
print("----------------------------------------------------")

#crosscv=cross_validate(rf,X,y,cv=5)['test_score'].mean()
#print('The cross validation score is '+str(crosscv))

----------------------------------------------------
Confusion matrix 
[[956 284]
 [130 534]]
----------------------------------------------------
Report 
              precision    recall  f1-score   support

           0       0.77      0.88      0.82      1086
           1       0.80      0.65      0.72       818

    accuracy                           0.78      1904
   macro avg       0.79      0.77      0.77      1904
weighted avg       0.79      0.78      0.78      1904

----------------------------------------------------


### creating the submission

In [8]:
def predictions(test_df):
    
    aa=np.array(test_df['id'])
    
    test_df['text']=test_df['text'].str.replace('[^a-zA-Z]',' ') 

    # converting the text in tabular form 
    tf_vec=tf.transform(test_df['text'])

    # build a new DataFrame with the converted text
    text_df=pd.DataFrame(tf_vec.toarray(),columns=tf.get_feature_names()).add_prefix('TXT_')

    # concatenating the 2 data frames
    new_df=pd.concat([test_df,text_df],axis=1)
    new_df.drop('text',axis=1,inplace=True)

    X=new_df.drop(['keyword','location','id'],axis=1)
    bb=rf.predict(X)
    
    return aa,bb,X

### final predictions using the test dataset

In [9]:
test_df=pd.read_csv('test.csv')

peppe=predictions(test_df)

risultati=pd.DataFrame(list(zip(peppe[0],peppe[1])),columns=['id','target'])

risultati.to_csv('risultati.csv',index=False)