In [8]:
import pandas as pd
from sklearn import feature_extraction, linear_model, model_selection, preprocessing

In [9]:
twitter_df = pd.read_csv(r'E:\Python\Datasets\Disaster_Tweets\train.csv')
twitter_df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [10]:
twitter_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        7613 non-null   int64 
 1   keyword   7552 non-null   object
 2   location  5080 non-null   object
 3   text      7613 non-null   object
 4   target    7613 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 297.5+ KB


In [11]:
twitter_df['target'].value_counts()

0    4342
1    3271
Name: target, dtype: int64

## Text preprocessing

#### 1. Lower case

In [12]:
# Lower case

twitter_df['cleaned_text'] = twitter_df['text'].apply(lambda x: x.lower())
twitter_df.head()

Unnamed: 0,id,keyword,location,text,target,cleaned_text
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,our deeds are the reason of this #earthquake m...
1,4,,,Forest fire near La Ronge Sask. Canada,1,forest fire near la ronge sask. canada
2,5,,,All residents asked to 'shelter in place' are ...,1,all residents asked to 'shelter in place' are ...
3,6,,,"13,000 people receive #wildfires evacuation or...",1,"13,000 people receive #wildfires evacuation or..."
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,just got sent this photo from ruby #alaska as ...


#### 2. Expand the Contrations

In [13]:
# Expand the Contrations
import contractions

In [14]:
# Test
contractions.fix("I'd like to know how I'd done that!")

'I would like to know how I would done that!'

In [15]:
# Expanding the Contrations
twitter_df['cleaned_text'] = twitter_df['cleaned_text'].apply(contractions.fix)

In [19]:
# Checking contractions
print(twitter_df["text"][67])
print(twitter_df["cleaned_text"][67])

'I can't have kids cuz I got in a bicycle accident &amp; split my testicles. it's impossible for me to have kids' MICHAEL YOU ARE THE FATHER
'i can not have kids cuz i got in a bicycle accident &amp; split my testicles. it is impossible for me to have kids' michael you are the father


#### 3. Noise Removal

3.1 Remove URLs

In [22]:
import re
def remove_URL(text):
    """
        Remove URLs from a simple string
    """
    return re.sub(r"https?://\S+|www\.\S+", "", text)

In [23]:
# remove url from the text
twitter_df['cleaned_text'] = twitter_df['cleaned_text'].apply(remove_URL)

In [24]:
# Checking
print(twitter_df["text"][31])
print(twitter_df["cleaned_text"][31])

@bbcmtd Wholesale Markets ablaze http://t.co/lHYXEOHY6C
@bbcmtd wholesale markets ablaze 


3.2 Remove HTML tags

In [26]:
def remove_html_tag(text):
    """
        Remove html tags from a simple text
    """
    html_tag = re.compile(r"<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});")
    return re.sub(html_tag, "", text)

In [27]:
# remove html tags from text
twitter_df['cleaned_text'] = twitter_df['cleaned_text'].apply(remove_html_tag)

In [29]:
# Checking
print(twitter_df["text"][62])
print(twitter_df["cleaned_text"][62])

Rene Ablaze &amp; Jacinta - Secret 2k13 (Fallen Skies Edit) - Mar 30 2013  https://t.co/7MLMsUzV1Z
rene ablaze  jacinta - secret 2k13 (fallen skies edit) - mar 30 2013  


3.3 Remove Non-ASCI:

In [31]:
def removal_non_ascii(text):
    return re.sub(r'[^\x00-\x7f]', '', text)

In [32]:
twitter_df['cleaned_text'] = twitter_df['cleaned_text'].apply(removal_non_ascii)

In [34]:
# Checking
print(twitter_df["text"][38])
print(twitter_df["cleaned_text"][38])

Barbados #Bridgetown JAMAICA ÛÒ Two cars set ablaze: SANTA CRUZ ÛÓ Head of the St Elizabeth Police Superintende...  http://t.co/wDUEaj8Q4J
barbados #bridgetown jamaica  two cars set ablaze: santa cruz  head of the st elizabeth police superintende...  


#### 4. Remove punctuations

In [37]:
import string
def remove_punc(text):
#     return re.sub(r'[]!"$%&\'()*+,./:;=#@?[\\^_`{|}~-]+', "", text)
    return text.translate(str.maketrans('', '', string.punctuation))

In [38]:
twitter_df['cleaned_text'] = twitter_df['cleaned_text'].apply(remove_punc)

In [39]:
# Checking
print(twitter_df["text"][5])
print(twitter_df["cleaned_text"][5])

#RockyFire Update => California Hwy. 20 closed in both directions due to Lake County fire - #CAfire #wildfires
rockyfire update  california hwy 20 closed in both directions due to lake county fire  cafire wildfires


#### 5. Tokeinzation

In [46]:
from nltk.tokenize import word_tokenize

twitter_df['tokenized'] = twitter_df['cleaned_text'].apply(word_tokenize)
twitter_df.head()

Unnamed: 0,id,keyword,location,text,target,cleaned_text,tokenized
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,our deeds are the reason of this earthquake ma...,"[our, deeds, are, the, reason, of, this, earth..."
1,4,,,Forest fire near La Ronge Sask. Canada,1,forest fire near la ronge sask canada,"[forest, fire, near, la, ronge, sask, canada]"
2,5,,,All residents asked to 'shelter in place' are ...,1,all residents asked to shelter in place are be...,"[all, residents, asked, to, shelter, in, place..."
3,6,,,"13,000 people receive #wildfires evacuation or...",1,13000 people receive wildfires evacuation orde...,"[13000, people, receive, wildfires, evacuation..."
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,just got sent this photo from ruby alaska as s...,"[just, got, sent, this, photo, from, ruby, ala..."


#### 6.Removing stopwords

In [41]:
import nltk
nltk.download("stopwords")
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\maksb\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [47]:
stop = set(stopwords.words('english'))
twitter_df['stopwords_removed'] = twitter_df['tokenized'].apply(lambda x: [word for word in x if word not in stop])
twitter_df.head()

Unnamed: 0,id,keyword,location,text,target,cleaned_text,tokenized,stopwords_removed
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,our deeds are the reason of this earthquake ma...,"[our, deeds, are, the, reason, of, this, earth...","[deeds, reason, earthquake, may, allah, forgiv..."
1,4,,,Forest fire near La Ronge Sask. Canada,1,forest fire near la ronge sask canada,"[forest, fire, near, la, ronge, sask, canada]","[forest, fire, near, la, ronge, sask, canada]"
2,5,,,All residents asked to 'shelter in place' are ...,1,all residents asked to shelter in place are be...,"[all, residents, asked, to, shelter, in, place...","[residents, asked, shelter, place, notified, o..."
3,6,,,"13,000 people receive #wildfires evacuation or...",1,13000 people receive wildfires evacuation orde...,"[13000, people, receive, wildfires, evacuation...","[13000, people, receive, wildfires, evacuation..."
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,just got sent this photo from ruby alaska as s...,"[just, got, sent, this, photo, from, ruby, ala...","[got, sent, photo, ruby, alaska, smoke, wildfi..."


In [9]:
X_train, X_test, y_train, y_test = model_selection.train_test_split(twitter_df['text'], twitter_df['target'], random_state = 0)

In [10]:
count_vectorizer = feature_extraction.text.CountVectorizer()
train_vectors = count_vectorizer.fit_transform(X_train)
test_vectors = count_vectorizer.transform(X_test)

In [11]:
ridge_model_clf = linear_model.RidgeClassifier()

In [12]:
cv_scores = model_selection.cross_val_score(ridge_model_clf, train_vectors, y_train, cv = 3, scoring='f1')

In [13]:
cv_scores

array([0.70415335, 0.72380952, 0.73561732])

In [15]:
ridge_model_clf.fit(train_vectors, y_train)

RidgeClassifier(alpha=1.0, class_weight=None, copy_X=True, fit_intercept=True,
                max_iter=None, normalize=False, random_state=None,
                solver='auto', tol=0.001)

In [16]:
prediction = ridge_model_clf.predict(test_vectors)

In [17]:
from sklearn.metrics import f1_score

In [18]:
f1_score(prediction, y_test)

0.7385837193911318

In [19]:
twitter_test = pd.read_csv(r'E:\Python\Datasets\Disaster_Tweets\test.csv')


In [21]:
twitter_test.head()

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


In [36]:
from sklearn.model_selection import GridSearchCV

In [41]:
ridge_model_clf.get_params()

{'alpha': 1.0,
 'class_weight': None,
 'copy_X': True,
 'fit_intercept': True,
 'max_iter': None,
 'normalize': False,
 'random_state': None,
 'solver': 'auto',
 'tol': 0.001}

In [49]:
import numpy as np


In [52]:
from sklearn.pipeline import Pipeline

In [78]:
ridge_model_clf = linear_model.RidgeClassifier()
count_vectorizer = feature_extraction.text.CountVectorizer()

pipeline = Pipeline([('CountVectorizer', feature_extraction.text.CountVectorizer()), ('classifier', linear_model.RidgeClassifier())])

In [85]:
ridge_model_clf.get_params().keys()

dict_keys(['alpha', 'class_weight', 'copy_X', 'fit_intercept', 'max_iter', 'normalize', 'random_state', 'solver', 'tol'])

In [None]:
parameters = {
    'vect__max_df': (0.5, 0.75, 1.0),
    # 'vect__max_features': (None, 5000, 10000, 50000),
    'vect__ngram_range': ((1, 1), (1, 2)),  # unigrams or bigrams
    # 'tfidf__use_idf': (True, False),
    # 'tfidf__norm': ('l1', 'l2'),
    'clf__max_iter': (20,),
    'clf__alpha': (0.00001, 0.000001),
    'clf__penalty': ('l2', 'elasticnet'),
    # 'clf__max_iter': (10, 50, 80),
}

In [110]:
parametrs = {'classifier__alpha': list(np.linspace(20,35,30)),
             'classifier__class_weight': [None, 'balanced'],
             'classifier__copy_X': [True],
             'classifier__fit_intercept': [True],
             'classifier__max_iter': [None],
             'classifier__normalize': [False],
             'classifier__random_state': [0],
             'classifier__solver': ['auto'],
             'classifier__tol': [0.001]}
clf_grid = GridSearchCV(pipeline, param_grid=parametrs, cv = 3)
clf_grid.fit(X_train, y_train)

GridSearchCV(cv=3, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('CountVectorizer',
                                        CountVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.int64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                 

In [113]:
clf_grid.best_params_

{'classifier__alpha': 21.03448275862069,
 'classifier__class_weight': None,
 'classifier__copy_X': True,
 'classifier__fit_intercept': True,
 'classifier__max_iter': None,
 'classifier__normalize': False,
 'classifier__random_state': 0,
 'classifier__solver': 'auto',
 'classifier__tol': 0.001}

In [117]:
clf_best = linear_model.RidgeClassifier(alpha = 21)

In [118]:
clf_best.fit(train_vectors, y_train)


RidgeClassifier(alpha=21, class_weight=None, copy_X=True, fit_intercept=True,
                max_iter=None, normalize=False, random_state=None,
                solver='auto', tol=0.001)

In [120]:
f1_score(clf_best.predict(test_vectors),y_test)

0.761049723756906

In [125]:
cv_scores = model_selection.cross_val_score(clf_best, train_vectors, y_train, cv = 3, scoring='f1')
cv_scores.mean()

0.7445634660882164

In [109]:
clf_grid.best_params_

{'classifier__alpha': 24.655172413793103,
 'classifier__class_weight': 'balanced',
 'classifier__copy_X': True,
 'classifier__fit_intercept': True,
 'classifier__max_iter': None,
 'classifier__normalize': False,
 'classifier__random_state': 0,
 'classifier__solver': 'auto',
 'classifier__tol': 0.001}

In [74]:
parametrs = {'alpha': [0.1],
             'class_weight': [None, 'balanced'],
             'copy_X': [True],
             'fit_intercept': [True],
             'max_iter': [None],
             'normalize': [False],
             'random_state': [0],
             'solver': ['auto'],
             'tol': [0.001]}
clf_grid = GridSearchCV(ridge_model_clf, param_grid=parametrs, cv = 3)
clf_grid.fit(train_vectors, y_train)

GridSearchCV(cv=3, error_score='raise-deprecating',
             estimator=RidgeClassifier(alpha=1.0, class_weight=None,
                                       copy_X=True, fit_intercept=True,
                                       max_iter=None, normalize=False,
                                       random_state=None, solver='auto',
                                       tol=0.001),
             iid='warn', n_jobs=None,
             param_grid={'alpha': [0.1], 'class_weight': [None, 'balanced'],
                         'copy_X': [True], 'fit_intercept': [True],
                         'max_iter': [None], 'normalize': [False],
                         'random_state': [0], 'solver': ['auto'],
                         'tol': [0.001]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [77]:
clf_grid.best_score_

0.7565247854265196

## Making submission prediction

In [128]:
count_vectorizer_sumbission = feature_extraction.text.CountVectorizer()
submission_train_vectors = count_vectorizer_sumbission.fit_transform(twitter_df['text'])
submission_test_vectors = count_vectorizer_sumbission.transform(twitter_test['text'])

In [129]:
clf_best.fit(submission_train_vectors, twitter_df['target'])
submission_prediction = clf_best.predict(submission_test_vectors)

In [130]:
submission_prediction

array([0, 0, 1, ..., 1, 1, 0], dtype=int64)

In [131]:
twitter_test.head()

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


In [132]:
submission_df = twitter_test[['id']]

In [133]:
submission_df.head()

Unnamed: 0,id
0,0
1,2
2,3
3,9
4,11


In [134]:
submission_df = twitter_test[['id']].copy()
submission_df['target'] = submission_prediction

In [135]:
submission_df.head()

Unnamed: 0,id,target
0,0,0
1,2,0
2,3,1
3,9,0
4,11,1


In [136]:
submission_df.to_csv(r'E:\Python\Datasets\Disaster_Tweets\submission.csv', index=False)