In [1]:
import pandas as pd
from sklearn import feature_extraction, linear_model, model_selection, preprocessing
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

In [2]:
twitter_df = pd.read_csv(r'E:\Python\Datasets\Disaster_Tweets\train.csv')
twitter_df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [78]:
twitter_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 12 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   id                     7613 non-null   int64 
 1   keyword                7552 non-null   object
 2   location               5080 non-null   object
 3   text                   7613 non-null   object
 4   target                 7613 non-null   int64 
 5   cleaned_text           7613 non-null   object
 6   tokenized              7613 non-null   object
 7   stopwords_removed      7613 non-null   object
 8   porter_stemmer         7613 non-null   object
 9   lemmatize_word         7613 non-null   object
 10  porter_stemmer_joined  7613 non-null   object
 11  lemmatize_word_joined  7613 non-null   object
dtypes: int64(2), object(10)
memory usage: 713.8+ KB


In [4]:
twitter_df['target'].value_counts()

0    4342
1    3271
Name: target, dtype: int64

## Text preprocessing

#### 1. Lower case

In [5]:
# Lower case

twitter_df['cleaned_text'] = twitter_df['text'].apply(lambda x: x.lower())
twitter_df.head()

Unnamed: 0,id,keyword,location,text,target,cleaned_text
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,our deeds are the reason of this #earthquake m...
1,4,,,Forest fire near La Ronge Sask. Canada,1,forest fire near la ronge sask. canada
2,5,,,All residents asked to 'shelter in place' are ...,1,all residents asked to 'shelter in place' are ...
3,6,,,"13,000 people receive #wildfires evacuation or...",1,"13,000 people receive #wildfires evacuation or..."
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,just got sent this photo from ruby #alaska as ...


#### 2. Expand the Contrations

In [6]:
# Expand the Contrations
import contractions

In [7]:
# Test
contractions.fix("I'd like to know how I'd done that!")

'I would like to know how I would done that!'

In [8]:
# Expanding the Contrations
twitter_df['cleaned_text'] = twitter_df['cleaned_text'].apply(contractions.fix)

In [9]:
# Checking contractions
print(twitter_df["text"][67])
print(twitter_df["cleaned_text"][67])

'I can't have kids cuz I got in a bicycle accident &amp; split my testicles. it's impossible for me to have kids' MICHAEL YOU ARE THE FATHER
'i can not have kids cuz i got in a bicycle accident &amp; split my testicles. it is impossible for me to have kids' michael you are the father


#### 3. Noise Removal

3.1 Remove URLs

In [10]:
import re
def remove_URL(text):
    """
        Remove URLs from a simple string
    """
    return re.sub(r"https?://\S+|www\.\S+", "", text)

In [11]:
# remove url from the text
twitter_df['cleaned_text'] = twitter_df['cleaned_text'].apply(remove_URL)

In [12]:
# Checking
print(twitter_df["text"][31])
print(twitter_df["cleaned_text"][31])

@bbcmtd Wholesale Markets ablaze http://t.co/lHYXEOHY6C
@bbcmtd wholesale markets ablaze 


3.2 Remove HTML tags

In [13]:
def remove_html_tag(text):
    """
        Remove html tags from a simple text
    """
    html_tag = re.compile(r"<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});")
    return re.sub(html_tag, "", text)

In [14]:
# remove html tags from text
twitter_df['cleaned_text'] = twitter_df['cleaned_text'].apply(remove_html_tag)

In [15]:
# Checking
print(twitter_df["text"][62])
print(twitter_df["cleaned_text"][62])

Rene Ablaze &amp; Jacinta - Secret 2k13 (Fallen Skies Edit) - Mar 30 2013  https://t.co/7MLMsUzV1Z
rene ablaze  jacinta - secret 2k13 (fallen skies edit) - mar 30 2013  


3.3 Remove Non-ASCI:

In [16]:
def removal_non_ascii(text):
    return re.sub(r'[^\x00-\x7f]', '', text)

In [17]:
twitter_df['cleaned_text'] = twitter_df['cleaned_text'].apply(removal_non_ascii)

In [18]:
# Checking
print(twitter_df["text"][38])
print(twitter_df["cleaned_text"][38])

Barbados #Bridgetown JAMAICA ÛÒ Two cars set ablaze: SANTA CRUZ ÛÓ Head of the St Elizabeth Police Superintende...  http://t.co/wDUEaj8Q4J
barbados #bridgetown jamaica  two cars set ablaze: santa cruz  head of the st elizabeth police superintende...  


#### 4. Remove punctuations

In [19]:
import string
def remove_punc(text):
#     return re.sub(r'[]!"$%&\'()*+,./:;=#@?[\\^_`{|}~-]+', "", text)
    return text.translate(str.maketrans('', '', string.punctuation))

In [20]:
twitter_df['cleaned_text'] = twitter_df['cleaned_text'].apply(remove_punc)

In [21]:
# Checking
print(twitter_df["text"][5])
print(twitter_df["cleaned_text"][5])

#RockyFire Update => California Hwy. 20 closed in both directions due to Lake County fire - #CAfire #wildfires
rockyfire update  california hwy 20 closed in both directions due to lake county fire  cafire wildfires


#### 5. Tokeinzation

In [22]:
from nltk.tokenize import word_tokenize

twitter_df['tokenized'] = twitter_df['cleaned_text'].apply(word_tokenize)
twitter_df.head()

Unnamed: 0,id,keyword,location,text,target,cleaned_text,tokenized
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,our deeds are the reason of this earthquake ma...,"[our, deeds, are, the, reason, of, this, earth..."
1,4,,,Forest fire near La Ronge Sask. Canada,1,forest fire near la ronge sask canada,"[forest, fire, near, la, ronge, sask, canada]"
2,5,,,All residents asked to 'shelter in place' are ...,1,all residents asked to shelter in place are be...,"[all, residents, asked, to, shelter, in, place..."
3,6,,,"13,000 people receive #wildfires evacuation or...",1,13000 people receive wildfires evacuation orde...,"[13000, people, receive, wildfires, evacuation..."
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,just got sent this photo from ruby alaska as s...,"[just, got, sent, this, photo, from, ruby, ala..."


#### 6.Removing stopwords

In [23]:
import nltk
nltk.download("stopwords")
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\maksb\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [24]:
stop = set(stopwords.words('english'))
twitter_df['stopwords_removed'] = twitter_df['tokenized'].apply(lambda x: [word for word in x if word not in stop])
twitter_df.head()

Unnamed: 0,id,keyword,location,text,target,cleaned_text,tokenized,stopwords_removed
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,our deeds are the reason of this earthquake ma...,"[our, deeds, are, the, reason, of, this, earth...","[deeds, reason, earthquake, may, allah, forgiv..."
1,4,,,Forest fire near La Ronge Sask. Canada,1,forest fire near la ronge sask canada,"[forest, fire, near, la, ronge, sask, canada]","[forest, fire, near, la, ronge, sask, canada]"
2,5,,,All residents asked to 'shelter in place' are ...,1,all residents asked to shelter in place are be...,"[all, residents, asked, to, shelter, in, place...","[residents, asked, shelter, place, notified, o..."
3,6,,,"13,000 people receive #wildfires evacuation or...",1,13000 people receive wildfires evacuation orde...,"[13000, people, receive, wildfires, evacuation...","[13000, people, receive, wildfires, evacuation..."
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,just got sent this photo from ruby alaska as s...,"[just, got, sent, this, photo, from, ruby, ala...","[got, sent, photo, ruby, alaska, smoke, wildfi..."


#### 7. Stemming

In [25]:
from nltk.stem import PorterStemmer

def porter_stemmer(text):
    
    stemmer = PorterStemmer()
    stems = [stemmer.stem(word) for word in text]
    return stems

In [26]:
twitter_df['porter_stemmer'] = twitter_df['stopwords_removed'].apply(porter_stemmer)
twitter_df.head()

Unnamed: 0,id,keyword,location,text,target,cleaned_text,tokenized,stopwords_removed,porter_stemmer
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,our deeds are the reason of this earthquake ma...,"[our, deeds, are, the, reason, of, this, earth...","[deeds, reason, earthquake, may, allah, forgiv...","[deed, reason, earthquak, may, allah, forgiv, us]"
1,4,,,Forest fire near La Ronge Sask. Canada,1,forest fire near la ronge sask canada,"[forest, fire, near, la, ronge, sask, canada]","[forest, fire, near, la, ronge, sask, canada]","[forest, fire, near, la, rong, sask, canada]"
2,5,,,All residents asked to 'shelter in place' are ...,1,all residents asked to shelter in place are be...,"[all, residents, asked, to, shelter, in, place...","[residents, asked, shelter, place, notified, o...","[resid, ask, shelter, place, notifi, offic, ev..."
3,6,,,"13,000 people receive #wildfires evacuation or...",1,13000 people receive wildfires evacuation orde...,"[13000, people, receive, wildfires, evacuation...","[13000, people, receive, wildfires, evacuation...","[13000, peopl, receiv, wildfir, evacu, order, ..."
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,just got sent this photo from ruby alaska as s...,"[just, got, sent, this, photo, from, ruby, ala...","[got, sent, photo, ruby, alaska, smoke, wildfi...","[got, sent, photo, rubi, alaska, smoke, wildfi..."


### 8. Lemmatization

In [27]:
from nltk.stem import WordNetLemmatizer

def lemmatize_word(text):
    """
        Lemmatize the tokenized words
    """

    lemmatizer = WordNetLemmatizer()
    lemma = [lemmatizer.lemmatize(word, tag) for word, tag in text]
    return lemma

In [28]:
lemmatizer = WordNetLemmatizer()

twitter_df['lemmatize_word'] = twitter_df['stopwords_removed'].apply(lambda x: [lemmatizer.lemmatize(word) for word in x])
twitter_df.head()

Unnamed: 0,id,keyword,location,text,target,cleaned_text,tokenized,stopwords_removed,porter_stemmer,lemmatize_word
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,our deeds are the reason of this earthquake ma...,"[our, deeds, are, the, reason, of, this, earth...","[deeds, reason, earthquake, may, allah, forgiv...","[deed, reason, earthquak, may, allah, forgiv, us]","[deed, reason, earthquake, may, allah, forgive..."
1,4,,,Forest fire near La Ronge Sask. Canada,1,forest fire near la ronge sask canada,"[forest, fire, near, la, ronge, sask, canada]","[forest, fire, near, la, ronge, sask, canada]","[forest, fire, near, la, rong, sask, canada]","[forest, fire, near, la, ronge, sask, canada]"
2,5,,,All residents asked to 'shelter in place' are ...,1,all residents asked to shelter in place are be...,"[all, residents, asked, to, shelter, in, place...","[residents, asked, shelter, place, notified, o...","[resid, ask, shelter, place, notifi, offic, ev...","[resident, asked, shelter, place, notified, of..."
3,6,,,"13,000 people receive #wildfires evacuation or...",1,13000 people receive wildfires evacuation orde...,"[13000, people, receive, wildfires, evacuation...","[13000, people, receive, wildfires, evacuation...","[13000, peopl, receiv, wildfir, evacu, order, ...","[13000, people, receive, wildfire, evacuation,..."
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,just got sent this photo from ruby alaska as s...,"[just, got, sent, this, photo, from, ruby, ala...","[got, sent, photo, ruby, alaska, smoke, wildfi...","[got, sent, photo, rubi, alaska, smoke, wildfi...","[got, sent, photo, ruby, alaska, smoke, wildfi..."


In [29]:
twitter_df['porter_stemmer_joined'] = twitter_df['porter_stemmer'].apply(lambda x: ' '.join(x))
twitter_df['lemmatize_word_joined'] = twitter_df['lemmatize_word'].apply(lambda x: ' '.join(x))

### 9. CountVectorizer

In [30]:
from sklearn.pipeline import Pipeline

In [31]:
def check_preporcessing_methods_with_prediction(X, y, alpha = list(np.linspace(1,50,10))):
    
    X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, random_state = 1)
    pipeline = Pipeline([('CountVectorizer', feature_extraction.text.CountVectorizer()), ('classifier', linear_model.RidgeClassifier())])
    parametrs = {'classifier__alpha': alpha,
             'classifier__class_weight': [None, 'balanced'],
             'classifier__copy_X': [True],
             'classifier__fit_intercept': [True],
             'classifier__max_iter': [None],
             'classifier__normalize': [False],
             'classifier__random_state': [0],
             'classifier__solver': ['auto'],
             'classifier__tol': [0.001]}
    clf_grid = GridSearchCV(pipeline, param_grid=parametrs, cv = 3)
    clf_grid.fit(X_train, y_train)
    print('Best Score:', clf_grid.best_score_)
    print('Best Params', clf_grid.best_params_)

In [32]:
check_preporcessing_methods_with_prediction(twitter_df['porter_stemmer_joined'],twitter_df['target'], list(np.linspace(7,16,15)) )

Best Score: 0.7961114030478192
Best Params {'classifier__alpha': 14.071428571428573, 'classifier__class_weight': None, 'classifier__copy_X': True, 'classifier__fit_intercept': True, 'classifier__max_iter': None, 'classifier__normalize': False, 'classifier__random_state': 0, 'classifier__solver': 'auto', 'classifier__tol': 0.001}


In [33]:
check_preporcessing_methods_with_prediction(twitter_df['lemmatize_word_joined'],twitter_df['target'] ,list(np.linspace(24,28,15)))

Best Score: 0.7999649675950254
Best Params {'classifier__alpha': 27.428571428571427, 'classifier__class_weight': 'balanced', 'classifier__copy_X': True, 'classifier__fit_intercept': True, 'classifier__max_iter': None, 'classifier__normalize': False, 'classifier__random_state': 0, 'classifier__solver': 'auto', 'classifier__tol': 0.001}


In [34]:
X_train, X_test, y_train, y_test = model_selection.train_test_split(twitter_df['text'], twitter_df['target'], random_state = 0)

In [35]:
count_vectorizer = feature_extraction.text.CountVectorizer()
train_vectors = count_vectorizer.fit_transform(X_train)
test_vectors = count_vectorizer.transform(X_test)

In [36]:
ridge_model_clf = linear_model.RidgeClassifier()

In [37]:
cv_scores = model_selection.cross_val_score(ridge_model_clf, train_vectors, y_train, cv = 3, scoring='f1')

In [38]:
cv_scores

array([0.70415335, 0.72426938, 0.7343245 ])

In [39]:
ridge_model_clf.fit(train_vectors, y_train)

RidgeClassifier(alpha=1.0, class_weight=None, copy_X=True, fit_intercept=True,
                max_iter=None, normalize=False, random_state=None,
                solver='auto', tol=0.001)

In [40]:
prediction = ridge_model_clf.predict(test_vectors)

In [41]:
from sklearn.metrics import f1_score

In [42]:
f1_score(prediction, y_test)

0.7390728476821192

In [43]:
twitter_test = pd.read_csv(r'E:\Python\Datasets\Disaster_Tweets\test.csv')


In [44]:
twitter_test.head()

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


In [45]:
ridge_model_clf.get_params()

{'alpha': 1.0,
 'class_weight': None,
 'copy_X': True,
 'fit_intercept': True,
 'max_iter': None,
 'normalize': False,
 'random_state': None,
 'solver': 'auto',
 'tol': 0.001}

In [46]:


pipeline = Pipeline([('CountVectorizer', feature_extraction.text.CountVectorizer()), ('classifier', linear_model.RidgeClassifier())])

In [47]:
parametrs = {'classifier__alpha': list(np.linspace(1,50,10)),
             'classifier__class_weight': [None, 'balanced'],
             'classifier__copy_X': [True],
             'classifier__fit_intercept': [True],
             'classifier__max_iter': [None],
             'classifier__normalize': [False],
             'classifier__random_state': [0],
             'classifier__solver': ['auto'],
             'classifier__tol': [0.001]}
clf_grid = GridSearchCV(pipeline, param_grid=parametrs, cv = 3)
clf_grid.fit(X_train, y_train)

GridSearchCV(cv=3, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('CountVectorizer',
                                        CountVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.int64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                 

In [48]:
clf_grid.best_score_

0.8001401296198984

In [49]:
clf_grid.best_params_

{'classifier__alpha': 17.333333333333336,
 'classifier__class_weight': None,
 'classifier__copy_X': True,
 'classifier__fit_intercept': True,
 'classifier__max_iter': None,
 'classifier__normalize': False,
 'classifier__random_state': 0,
 'classifier__solver': 'auto',
 'classifier__tol': 0.001}

In [50]:
clf_best = linear_model.RidgeClassifier(alpha = 21)

In [51]:
clf_best.fit(train_vectors, y_train)


RidgeClassifier(alpha=21, class_weight=None, copy_X=True, fit_intercept=True,
                max_iter=None, normalize=False, random_state=None,
                solver='auto', tol=0.001)

In [52]:
f1_score(clf_best.predict(test_vectors),y_test)

0.761049723756906

In [53]:
cv_scores = model_selection.cross_val_score(clf_best, train_vectors, y_train, cv = 3, scoring='f1')
cv_scores.mean()

0.7451708565758928

In [54]:
clf_grid.best_params_

{'classifier__alpha': 17.333333333333336,
 'classifier__class_weight': None,
 'classifier__copy_X': True,
 'classifier__fit_intercept': True,
 'classifier__max_iter': None,
 'classifier__normalize': False,
 'classifier__random_state': 0,
 'classifier__solver': 'auto',
 'classifier__tol': 0.001}

In [55]:
parametrs = {'alpha': [0.1],
             'class_weight': [None, 'balanced'],
             'copy_X': [True],
             'fit_intercept': [True],
             'max_iter': [None],
             'normalize': [False],
             'random_state': [0],
             'solver': ['auto'],
             'tol': [0.001]}
clf_grid = GridSearchCV(ridge_model_clf, param_grid=parametrs, cv = 3)
clf_grid.fit(train_vectors, y_train)

GridSearchCV(cv=3, error_score='raise-deprecating',
             estimator=RidgeClassifier(alpha=1.0, class_weight=None,
                                       copy_X=True, fit_intercept=True,
                                       max_iter=None, normalize=False,
                                       random_state=None, solver='auto',
                                       tol=0.001),
             iid='warn', n_jobs=None,
             param_grid={'alpha': [0.1], 'class_weight': [None, 'balanced'],
                         'copy_X': [True], 'fit_intercept': [True],
                         'max_iter': [None], 'normalize': [False],
                         'random_state': [0], 'solver': ['auto'],
                         'tol': [0.001]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [56]:
clf_grid.best_score_

0.7565247854265196

### 10. TF-IDF Vectorizer

In [58]:
from sklearn.feature_extraction.text import TfidfVectorizer

def TFIDF(data, ngram = 1):
    tfidf_x = TfidfVectorizer(ngram_range = (ngram, ngram))
    emb = tfidf_x.fit_transform(data).toarray()
    return emb, tfidf_x

In [102]:
def check_preporcessing_methods_with_prediction(X, y, alpha = list(np.linspace(1,50,10))):
    
    X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, random_state = 1)
    pipeline = Pipeline([('TfidfVectorizer', TfidfVectorizer()), ('classifier', linear_model.RidgeClassifier())])
    parametrs = {'classifier__alpha': alpha,
             'classifier__class_weight': [None, 'balanced'],
             'classifier__copy_X': [True],
             'classifier__fit_intercept': [True],
             'classifier__max_iter': [None],
             'classifier__normalize': [False],
             'classifier__random_state': [0],
             'classifier__solver': ['auto'],
             'classifier__tol': [0.001],
             'TfidfVectorizer__ngram_range': [(1,1), (1,2), (1,3)]}
    clf_grid = GridSearchCV(pipeline, param_grid=parametrs, cv = 3, scoring= 'f1')
    clf_grid.fit(X, y)
    print('Best Score:', clf_grid.best_score_)
    print('Best Params', clf_grid.best_params_)

In [103]:
check_preporcessing_methods_with_prediction(twitter_df['porter_stemmer_joined'],twitter_df['target'], list(np.linspace(0.1,20,15)) )

Best Score: 0.6684860934652413
Best Params {'TfidfVectorizer__ngram_range': (1, 3), 'classifier__alpha': 1.5214285714285714, 'classifier__class_weight': 'balanced', 'classifier__copy_X': True, 'classifier__fit_intercept': True, 'classifier__max_iter': None, 'classifier__normalize': False, 'classifier__random_state': 0, 'classifier__solver': 'auto', 'classifier__tol': 0.001}


In [104]:
check_preporcessing_methods_with_prediction(twitter_df['lemmatize_word_joined'],twitter_df['target'], list(np.linspace(0.1,20,15)) )

Best Score: 0.6671608529923245
Best Params {'TfidfVectorizer__ngram_range': (1, 3), 'classifier__alpha': 2.9428571428571426, 'classifier__class_weight': 'balanced', 'classifier__copy_X': True, 'classifier__fit_intercept': True, 'classifier__max_iter': None, 'classifier__normalize': False, 'classifier__random_state': 0, 'classifier__solver': 'auto', 'classifier__tol': 0.001}


### 11. Word2vec averaging

In [86]:
import gensim

In [87]:
word2vec_path = ('E:/Python/Pretrained_vectors/GoogleNews-vectors-negative300.bin')
word2vec_model = gensim.models.KeyedVectors.load_word2vec_format(word2vec_path, binary = True, limit = None)

In [201]:
def get_average_vec(tokens_list, vector, generate_missing=False, k=300):
    """
        Calculate average embedding value of sentence from each word vector
    """
    
    if len(tokens_list)<1:
        return np.zeros(k)
    
    if generate_missing:
        vectorized = [vector[word] if word in vector else np.random.rand(k) for word in tokens_list]
    else:
        vectorized = [vector[word] if word in vector else np.zeros(k) for word in tokens_list]
    
    length = len(vectorized)
    summed = np.sum(vectorized, axis=0)
    averaged = np.divide(summed, length)
    return averaged.tolist()

In [202]:
def get_embeddings(vectors, text, generate_missing=False, k=300):
    """
        create the sentence embedding
    """
    embeddings = text.apply(lambda x: get_average_vec(x, vectors, generate_missing=generate_missing, k=k))
    return list(embeddings)

In [203]:
twitter_df['word2vec_averaged_vector'] = twitter_df['lemmatize_word'].apply(lambda x: get_average_vec(x,\
                                                    word2vec_model,generate_missing = True,k= 300))

In [190]:
# twitter_df['word2vec_averaged_vector'] = twitter_df['word2vec_averaged_vector'].apply(lambda x: x.astype('float64'))

In [227]:
def check_results(X, y, alpha = list(np.linspace(1,50,10))):
    X = X.tolist()

    parametrs = {'alpha': alpha,
             'class_weight': [None, 'balanced'],
             'copy_X': [True],
             'fit_intercept': [True],
             'max_iter': [None],
             'normalize': [False],
             'random_state': [0],
             'solver': ['auto'],
             'tol': [0.001]}
    clf_grid = GridSearchCV(linear_model.RidgeClassifier(), param_grid=parametrs, cv = 3, scoring= 'f1', error_score='raise')
    clf_grid.fit(X, y)
    print('Best Score:', clf_grid.best_score_)
    print('Best Params', clf_grid.best_params_)

In [231]:
check_results(twitter_df['word2vec_averaged_vector'],twitter_df['target'], list(np.linspace(40,50,20)) )

Best Score: 0.724441516783949
Best Params {'alpha': 45.26315789473684, 'class_weight': 'balanced', 'copy_X': True, 'fit_intercept': True, 'max_iter': None, 'normalize': False, 'random_state': 0, 'solver': 'auto', 'tol': 0.001}


In [220]:
ridge =linear_model.RidgeClassifier(alpha = 1)
train_vectors = twitter_df['word2vec_averaged_vector'].tolist()
y_train = twitter_df['target']
cv_scores = model_selection.cross_val_score(ridge, train_vectors, y_train, cv = 4, scoring='f1')

In [221]:
cv_scores

array([0.66441594, 0.68603214, 0.69149597, 0.74027604])

In [196]:
twi

array([1, 2, 3])

In [None]:
cv_scores = model_selection.cross_val_score(ridge_model_clf, train_vectors, y_train, cv = 3, scoring='f1')

In [141]:
for el in twitter_df['word2vec_averaged_vector'][:3]:
    print(el.dtype)

float64
float64
float64


In [206]:
twitter_df['word2vec_averaged_vector'][0]

[0.0872279554605484,
 0.02663748525083065,
 0.1637834757566452,
 0.1142229363322258,
 -0.080078125,
 0.0319998599588871,
 0.1100027933716774,
 -0.0509033203125,
 0.1780133992433548,
 0.1765485554933548,
 -0.0840715691447258,
 -0.1289760023355484,
 -0.1637137234210968,
 0.1197379007935524,
 -0.1378697007894516,
 0.1565987765789032,
 0.0750994011759758,
 0.0575125552713871,
 -0.0322636179625988,
 -0.1509748250246048,
 0.0736083984375,
 0.0992780402302742,
 0.2378627210855484,
 -0.0453578419983387,
 0.05859375,
 -0.04541015625,
 -0.0623604916036129,
 -0.0647670179605484,
 -0.0679234117269516,
 -0.118408203125,
 -0.0903669074177742,
 -0.0027378627564758062,
 -0.1729736328125,
 0.00250244140625,
 -0.0200020931661129,
 0.09637995809316635,
 0.0513044074177742,
 0.01032366044819355,
 0.0426112599670887,
 0.12689208984375,
 0.1547154039144516,
 0.0467093326151371,
 0.18359375,
 0.02054269053041935,
 0.1044049933552742,
 -0.15234375,
 -0.010027204640209675,
 -0.02840750478208065,
 -0.1530412882

In [135]:
b.astype('float64')

array([ 0.08722796,  0.02663749,  0.16378348,  0.11422294, -0.08007812,
        0.03199986,  0.11000279, -0.05090332,  0.1780134 ,  0.17654856,
       -0.08407157, -0.128976  , -0.16371372,  0.1197379 , -0.1378697 ,
        0.15659878,  0.0750994 ,  0.05751256, -0.03226362, -0.15097483,
        0.0736084 ,  0.09927804,  0.23786272, -0.04535784,  0.05859375,
       -0.04541016, -0.06236049, -0.06476702, -0.06792341, -0.1184082 ,
       -0.09036691, -0.00273786, -0.17297363,  0.00250244, -0.02000209,
        0.09637996,  0.05130441,  0.01032366,  0.04261126,  0.12689209,
        0.1547154 ,  0.04670933,  0.18359375,  0.02054269,  0.10440499,
       -0.15234375, -0.0100272 , -0.0284075 , -0.15304129, -0.06734794,
       -0.03759766,  0.04406738,  0.01819066,  0.09917777,  0.07883998,
        0.14004953, -0.16469029,  0.0165547 , -0.02604893, -0.1429269 ,
        0.06652614,  0.09927804, -0.07856369,  0.03336443, -0.0723005 ,
       -0.10081264,  0.03266253,  0.01604353, -0.06767927, -0.03

In [126]:
twitter_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 13 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   id                        7613 non-null   int64 
 1   keyword                   7552 non-null   object
 2   location                  5080 non-null   object
 3   text                      7613 non-null   object
 4   target                    7613 non-null   int64 
 5   cleaned_text              7613 non-null   object
 6   tokenized                 7613 non-null   object
 7   stopwords_removed         7613 non-null   object
 8   porter_stemmer            7613 non-null   object
 9   lemmatize_word            7613 non-null   object
 10  porter_stemmer_joined     7613 non-null   object
 11  lemmatize_word_joined     7613 non-null   object
 12  word2vec_averaged_vector  7613 non-null   object
dtypes: int64(2), object(11)
memory usage: 773.3+ KB


In [124]:
a[0]

array([ 0.08722796,  0.02663749,  0.16378348,  0.11422294, -0.08007812,
        0.03199986,  0.11000279, -0.05090332,  0.1780134 ,  0.17654856,
       -0.08407157, -0.128976  , -0.16371372,  0.1197379 , -0.1378697 ,
        0.15659878,  0.0750994 ,  0.05751256, -0.03226362, -0.15097483,
        0.0736084 ,  0.09927804,  0.23786272, -0.04535784,  0.05859375,
       -0.04541016, -0.06236049, -0.06476702, -0.06792341, -0.1184082 ,
       -0.09036691, -0.00273786, -0.17297363,  0.00250244, -0.02000209,
        0.09637996,  0.05130441,  0.01032366,  0.04261126,  0.12689209,
        0.1547154 ,  0.04670933,  0.18359375,  0.02054269,  0.10440499,
       -0.15234375, -0.0100272 , -0.0284075 , -0.15304129, -0.06734794,
       -0.03759766,  0.04406738,  0.01819066,  0.09917777,  0.07883998,
        0.14004953, -0.16469029,  0.0165547 , -0.02604893, -0.1429269 ,
        0.06652614,  0.09927804, -0.07856369,  0.03336443, -0.0723005 ,
       -0.10081264,  0.03266253,  0.01604353, -0.06767927, -0.03

## Making submission prediction

In [63]:
count_vectorizer_sumbission = feature_extraction.text.CountVectorizer()

submission_train_vectors = count_vectorizer_sumbission.fit_transform(twitter_df['lemmatize_word_joined'])


In [66]:
TfidfVectorizer_sumbission = TfidfVectorizer(ngram_range = (1, 2))
submission_train_vectors = TfidfVectorizer_sumbission.fit_transform(twitter_df['lemmatize_word_joined'])


In [67]:
# Preprocessing
twitter_test = pd.read_csv(r'E:\Python\Datasets\Disaster_Tweets\test.csv')

twitter_test['cleaned_text'] = twitter_test['text'].apply(lambda x: x.lower())
twitter_test['cleaned_text'] = twitter_test['cleaned_text'].apply(contractions.fix)
twitter_test['cleaned_text'] = twitter_test['cleaned_text'].apply(remove_URL)
twitter_test['cleaned_text'] = twitter_test['cleaned_text'].apply(remove_html_tag)
twitter_test['cleaned_text'] = twitter_test['cleaned_text'].apply(removal_non_ascii)
twitter_test['cleaned_text'] = twitter_test['cleaned_text'].apply(remove_punc)
twitter_test['tokenized'] = twitter_test['cleaned_text'].apply(word_tokenize)
twitter_test['stopwords_removed'] = twitter_test['tokenized'].apply(lambda x: [word for word in x if word not in stop])
twitter_test['lemmatize_word'] = twitter_test['stopwords_removed'].apply(lambda x: [lemmatizer.lemmatize(word) for word in x])
twitter_test['lemmatize_word_joined'] = twitter_test['lemmatize_word'].apply(lambda x: ' '.join(x))


In [65]:
submission_test_vectors = count_vectorizer_sumbission.transform(twitter_test['lemmatize_word_joined'])

NotFittedError: TfidfVectorizer - Vocabulary wasn't fitted.

In [68]:
submission_test_vectors = TfidfVectorizer_sumbission.transform(twitter_test['lemmatize_word_joined'])

In [69]:
clf_best = linear_model.RidgeClassifier(alpha = 1.5)
clf_best.fit(submission_train_vectors, twitter_df['target'])
submission_prediction = clf_best.predict(submission_test_vectors)

In [70]:
submission_prediction

array([1, 1, 1, ..., 1, 1, 1], dtype=int64)

In [71]:
twitter_test.head()

Unnamed: 0,id,keyword,location,text,cleaned_text,tokenized,stopwords_removed,lemmatize_word,lemmatize_word_joined
0,0,,,Just happened a terrible car crash,just happened a terrible car crash,"[just, happened, a, terrible, car, crash]","[happened, terrible, car, crash]","[happened, terrible, car, crash]",happened terrible car crash
1,2,,,"Heard about #earthquake is different cities, s...",heard about earthquake is different cities sta...,"[heard, about, earthquake, is, different, citi...","[heard, earthquake, different, cities, stay, s...","[heard, earthquake, different, city, stay, saf...",heard earthquake different city stay safe ever...
2,3,,,"there is a forest fire at spot pond, geese are...",there is a forest fire at spot pond geese are ...,"[there, is, a, forest, fire, at, spot, pond, g...","[forest, fire, spot, pond, geese, fleeing, acr...","[forest, fire, spot, pond, goose, fleeing, acr...",forest fire spot pond goose fleeing across str...
3,9,,,Apocalypse lighting. #Spokane #wildfires,apocalypse lighting spokane wildfires,"[apocalypse, lighting, spokane, wildfires]","[apocalypse, lighting, spokane, wildfires]","[apocalypse, lighting, spokane, wildfire]",apocalypse lighting spokane wildfire
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan,typhoon soudelor kills 28 in china and taiwan,"[typhoon, soudelor, kills, 28, in, china, and,...","[typhoon, soudelor, kills, 28, china, taiwan]","[typhoon, soudelor, kill, 28, china, taiwan]",typhoon soudelor kill 28 china taiwan


In [72]:
submission_df = twitter_test[['id']]

In [73]:
submission_df.head()

Unnamed: 0,id
0,0
1,2
2,3
3,9
4,11


In [74]:
submission_df = twitter_test[['id']].copy()
submission_df['target'] = submission_prediction

In [75]:
submission_df.head()

Unnamed: 0,id,target
0,0,1
1,2,1
2,3,1
3,9,1
4,11,1


In [76]:
submission_df.to_csv(r'E:\Python\Datasets\Disaster_Tweets\submission.csv', index=False)

## Making submission Word2vec

In [233]:
twitter_test = pd.read_csv(r'E:\Python\Datasets\Disaster_Tweets\test.csv')


In [234]:
twitter_test.head()

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


In [237]:
twitter_test['cleaned_text'] = twitter_test['text'].apply(lambda x: x.lower())
twitter_test['cleaned_text'] = twitter_test['cleaned_text'].apply(contractions.fix)
twitter_test['cleaned_text'] = twitter_test['cleaned_text'].apply(remove_URL)
twitter_test['cleaned_text'] = twitter_test['cleaned_text'].apply(remove_html_tag)
twitter_test['cleaned_text'] = twitter_test['cleaned_text'].apply(removal_non_ascii)
twitter_test['cleaned_text'] = twitter_test['cleaned_text'].apply(remove_punc)
twitter_test['tokenized'] = twitter_test['cleaned_text'].apply(word_tokenize)
twitter_test['stopwords_removed'] = twitter_test['tokenized'].apply(lambda x: [word for word in x if word not in stop])
twitter_test['lemmatize_word'] = twitter_test['stopwords_removed'].apply(lambda x: [lemmatizer.lemmatize(word) for word in x])
twitter_test['word2vec_average'] = twitter_test['lemmatize_word'].apply(lambda x: get_average_vec(x, word2vec_model, generate_missing=True))

In [242]:
ridge_clf = linear_model.RidgeClassifier(alpha = 45)
ridge_clf.fit(twitter_df['word2vec_averaged_vector'].tolist(), twitter_df['target'])
prediction = ridge_clf.predict(twitter_test['word2vec_average'].tolist())

In [245]:
submission_df = twitter_test[['id']].copy()
submission_df['target'] = prediction

In [246]:
submission_df.head()

Unnamed: 0,id,target
0,0,1
1,2,0
2,3,1
3,9,1
4,11,1


In [247]:
submission_df.to_csv(r'E:\Python\Datasets\Disaster_Tweets\submission.csv', index = False)