# 1. Import Libraries

In [1]:
import re, string, joblib, nltk, json, collections, pandas as pd
from sklearn import tree, metrics
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from textblob import Word 
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier
from nltk.corpus import stopwords
from sklearn.metrics import f1_score, recall_score, precision_score, confusion_matrix, accuracy_score,classification_report
from nltk.tokenize import RegexpTokenizer
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict, KFold
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.feature_extraction.text import TfidfVectorizer

print("Libraries imported....")

Libraries imported....


In [2]:
# ### Read csv file
Suicide = pd.read_csv("training_dataset.csv",encoding ="ISO-8859-1") 

# 2. Data Cleaning

Data Cleaning - Removing Null, Missing Values, Renaming Columns.

In [3]:
Suicide['Tweet']=Suicide['Tweet'].fillna("")                  #remove all the null value

In [4]:
# function to remove punctuation
def remove_punct(text):
    text  = "".join([char for char in text if char not in string.punctuation])
    text = re.sub('[0-9]+', '', text)
    return text

# 3. Data Preprocessing

<ul><li>Lower-casing</li>
    <li>NLTK</li> 
    <li>Removing Stop Words</li>
    <li>Language Filtering</li>
    <li>Lemmetization</li></ul>

In [6]:
Suicide['lower_case']= Suicide['Tweet'].apply(lambda x: x.lower())   
Suicide['tweet_punct'] = Suicide['lower_case'].apply(lambda x: remove_punct(x))   
#Dataset['RT'] = Dataset['lower_case'].replace({"rt": ''}, regex=True)
tokenizer = RegexpTokenizer(r'\w+')
Suicide['Special_word'] = Suicide.apply(lambda row: tokenizer.tokenize(row['lower_case']), axis=1)    

freq = pd.Series(' '.join(Suicide['Tweet']).split()).value_counts()[-10:]                       
freq = list(freq.index)
Suicide['Contents'] = Suicide['Tweet'].apply(lambda x: " ".join(x for x in x.split() if x not in freq)) 

stop = stopwords.words('english')
Suicide['stop_words'] = Suicide['Special_word'].apply(lambda x: [item for item in x if item not in stop])  

Suicide['stop_words'] = Suicide['stop_words'].astype('str')
Suicide['short_word'] = Suicide['stop_words'].str.findall('\w{3,}')         
Suicide['string'] = Suicide['stop_words'].replace({"'": '', ',': ''}, regex=True)
Suicide['string'] = Suicide['string'].str.findall('\w{3,}').str.join(' ') 
 
import nltk
nltk.download('stopwords')
words = set(nltk.corpus.words.words())
Suicide['NonEnglish'] = Suicide['string'].apply(lambda x: " ".join(x for x in x.split() if x in words))  

Suicide['tweet'] = Suicide['NonEnglish'].apply(lambda x: " ".join([Word(word).lemmatize() for word in x.split()]))


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ammar\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# 4. Applying n-gram (1,3)

Splitting the data into Train-Test ratio of 67-33. Applying n-gram (1,3) to Count Vectorizer and Fit-Transsform using Tf-IDF.

In [90]:
x_train, x_test, y_train, y_test = train_test_split(Suicide["tweet"],Suicide["Suicide"], test_size = 0.33, random_state = 42)
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer    
count_vect = CountVectorizer(ngram_range=(1, 3))               #set it to ngram (1,3)
transformer = TfidfTransformer(norm='l2',sublinear_tf=True)

x_train_counts = count_vect.fit_transform(x_train)
x_train_tfidf = transformer.fit_transform(x_train_counts)

x_test_counts = count_vect.transform(x_test)
x_test_tfidf = transformer.transform(x_test_counts)

print (x_train_tfidf.shape,x_test_tfidf.shape, y_train.shape, x_train.shape)

(7187, 285056) (3540, 285056) (7187,) (7187,)


# 5. Machine Learning Model

Using various Machine learning classifiers to Train, Test and Predict and Validate them.

# 5.1 GradientBoostingClassifier 

Running the GradientBoostingClassifier with the following parameters and capturing the performance metrics.

In [91]:
from sklearn.ensemble import GradientBoostingClassifier                       
model_1 = GradientBoostingClassifier(n_estimators=1000,
                                        max_features='auto', max_depth=4,
                                        random_state=1, verbose=1)

model_1.fit(x_train_tfidf, y_train)
y_pred1 = model_1.predict(x_test_tfidf)
from sklearn.metrics import accuracy_score,classification_report
print(accuracy_score(y_test, y_pred1))
print(classification_report(y_test, y_pred1))


# Cross Validation

scores_1 = cross_val_score(model_1, x_train_tfidf,y_train, cv=3)   #3 fold validation
print(accuracy_score(y_test,y_pred1))
print ("Cross-validated scores:", scores_1)

      Iter       Train Loss   Remaining Time 
         1           1.2764           16.31m
         2           1.2064           17.60m
         3           1.1492           17.29m
         4           1.1001           16.48m
         5           1.0586           15.98m
         6           1.0233           16.33m
         7           0.9923           16.78m
         8           0.9656           16.81m
         9           0.9377           16.44m
        10           0.9165           17.15m
        20           0.7589           16.58m
        30           0.6780           14.97m
        40           0.6360           13.78m
        50           0.6056           13.39m
        60           0.5815           13.19m
        70           0.5618           12.93m
        80           0.5445           12.66m
        90           0.5298           12.40m
       100           0.5164           12.10m
       200           0.4266           10.17m
       300           0.3721            9.05m
       40

# 5.2 AdaBoost with RF

AdaBoost with RandomForestClassifier

In [92]:
from sklearn.ensemble import AdaBoostClassifier
dt = RandomForestClassifier(n_estimators=40, max_depth=9, random_state=0)
model_2 = AdaBoostClassifier(base_estimator=dt, learning_rate=0.2, n_estimators=100)
model_2.fit(x_train_tfidf, y_train)                                                   
y_pred2 = model_2.predict(x_test_tfidf)
print(accuracy_score(y_test, y_pred2))
print(classification_report(y_test, y_pred2))

# Cross-Validation

scores_2 = cross_val_score(model_2, x_train_tfidf,y_train, cv=3)   #3 fold validation
print(accuracy_score(y_test,y_pred2))
print ("Cross-validated scores:", scores_2)

0.8624293785310735
              precision    recall  f1-score   support

           0       0.82      0.97      0.89      2015
           1       0.95      0.71      0.82      1525

    accuracy                           0.86      3540
   macro avg       0.89      0.84      0.85      3540
weighted avg       0.88      0.86      0.86      3540

0.8624293785310735
Cross-validated scores: [0.87353923 0.85350584 0.86096033]


# 6. Applying n-gram (1,2)

Splitting the data into Train-Test ratio of 67-33. Applying n-gram(1,2) to Count Vectorizer and Fit-Transsform using Tf-IDF.

In [None]:
X_train, X_test, y_train, y_test = train_test_split(Suicide["tweet"],Suicide["Suicide"], test_size = 0.33, random_state = 42)
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
count_vect = CountVectorizer(ngram_range=(1, 2))
transformer = TfidfTransformer(norm='l2',sublinear_tf=True)

X_train_counts = count_vect.fit_transform(X_train)
X_train_tfidf = transformer.fit_transform(X_train_counts)

X_test_counts = count_vect.transform(X_test)
X_test_tfidf = transformer.transform(X_test_counts)

print (X_train_tfidf.shape,X_test_tfidf.shape, y_train.shape, x_train.shape)

# 7. Machine Learning Algorithm

# 7.1 Random Forest and Adaboost

RandomForestClassifier with AdaBoost.

In [94]:
dt_stump = RandomForestClassifier(n_estimators=40, max_depth=9, random_state=0)
model_a = AdaBoostClassifier(base_estimator=dt_stump, learning_rate=0.1, n_estimators=100)
model_a.fit(X_train_tfidf, y_train)                                                   
y_preda = model_a.predict(X_test_tfidf)
print("Accuracy: ", accuracy_score(y_test, y_preda))
print("Report:", classification_report(y_test, y_preda))

### Cross Validation

scores_a = cross_val_score(model_a, X_train_tfidf,y_train, cv=3)   #3 fold validation
print("Validation Accuracy: ", accuracy_score(y_test,y_preda))
print ("Cross-validated scores:", scores_a, "\n")

Accuracy:  0.8559322033898306
Report:               precision    recall  f1-score   support

           0       0.81      0.98      0.89      2015
           1       0.96      0.69      0.81      1525

    accuracy                           0.86      3540
   macro avg       0.89      0.84      0.85      3540
weighted avg       0.87      0.86      0.85      3540

Validation Accuracy:  0.8559322033898306
Cross-validated scores: [0.86936561 0.8484975  0.85219207] 



# 7.2 Gradient Boosting

Running the GradientBoostingClassifier with the following parameters and capturing the performance metrics.

In [95]:
from sklearn.ensemble import GradientBoostingClassifier                       
model_b = GradientBoostingClassifier(n_estimators=5000,
                                        max_features='auto', max_depth=5,
                                        random_state=1, verbose=1)

model_b.fit(X_train_tfidf, y_train)
y_predb = model_a.predict(X_test_tfidf)
from sklearn.metrics import accuracy_score,classification_report
print(accuracy_score(y_test, y_predb))
print(classification_report(y_test, y_predb))

#3 fold validation
scores_b = cross_val_score(model_b, X_train_tfidf,y_train, cv=3)
print(accuracy_score(y_test,y_predb))
print ("Cross-validated scores:", scores_b)

      Iter       Train Loss   Remaining Time 
         1           1.2702           33.54m
         2           1.1957           33.04m
         3           1.1343           34.09m
         4           1.0821           33.82m
         5           1.0376           33.46m
         6           0.9997           33.54m
         7           0.9666           33.31m
         8           0.9376           33.32m
         9           0.9074           33.08m
        10           0.8834           33.14m
        20           0.7196           32.82m
        30           0.6349           32.48m
        40           0.5852           31.80m
        50           0.5559           30.60m
        60           0.5332           29.84m
        70           0.5139           29.68m
        80           0.4944           29.44m
        90           0.4805           28.96m
       100           0.4674           28.58m
       200           0.3753           25.90m
       300           0.3248           24.27m
       40

# 7.3 AdaBoost with DT

AdaBoost with DecisionTreeClassifier

In [10]:
from sklearn.tree import DecisionTreeClassifier                               
from sklearn.ensemble import AdaBoostClassifier
dt_stump = tree.DecisionTreeClassifier()
model_i= AdaBoostClassifier(base_estimator=dt_stump, learning_rate=0.1, n_estimators=300)
model_i.fit(X_train_tfidf, y_train)                                                   
y_predi = model_i.predict(X_test_tfidf)
print(accuracy_score(y_test, y_predi))
print(classification_report(y_test, y_predi))

scores_i = cross_val_score(model_i, X_train_tfidf,y_train, cv=3)   #3 fold validation
print(accuracy_score(y_test,y_predi))
print ("Cross-validated scores:", scores_i)

0.8062146892655367
              precision    recall  f1-score   support

           0       0.81      0.86      0.83      2015
           1       0.80      0.73      0.77      1525

    accuracy                           0.81      3540
   macro avg       0.81      0.80      0.80      3540
weighted avg       0.81      0.81      0.80      3540

0.8062146892655367
Cross-validated scores: [0.86310518 0.83639399 0.8434238 ]


# 8. SVM Model

#### Using SVM rbf kernel and pipeline to impliment steps in sequence.

In [101]:
# split dataset into train-test set
x_train, x_test,y_train, y_test = train_test_split(Suicide['tweet'].values.astype('str'), Suicide['Suicide'].values.astype('str'), test_size = 0.40,
                                                   random_state = 400)
# Steps implementation in series
# Use pipeline to carry out steps in sequence with a single object
# SVM's rbf kernel gives highest accuracy.

clf_model = Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()), ('clf', SVC(kernel = 'rbf'))])

# Training

clf_model.fit(x_train, y_train)

# predict class form test data 
predict = clf_model.predict(x_test)

# print accuracy and classification report
accuracy = (accuracy_score(y_test, predict))
print("Accuracy: ", accuracy)
print("Report: ", classification_report(y_test, predict), "\n")

Accuracy:  0.8981589373106502
Report:                precision    recall  f1-score   support

           0       0.88      0.95      0.92      2473
           1       0.93      0.82      0.87      1818

    accuracy                           0.90      4291
   macro avg       0.90      0.89      0.89      4291
weighted avg       0.90      0.90      0.90      4291
 



# 10. Save best model

using joblib to save model

In [102]:
save_model = 'svm_model.sav'
joblib.dump(clf_model, save_model) # to save model
print("Model saved to disk...")

Model saved to disk...
