<a href="https://colab.research.google.com/github/BrianGisemba/MENTAL-HEALTH-TWEETS-CLASSIFICATION/blob/data_preprocessing/Data_Pre_processing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Load the libraries needed 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
#for text pre-processing
import re, string
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
# Fetch the NLTK resources this notebook relies on (tokenizer, POS tagger, WordNet)
for _nltk_resource in ('punkt', 'averaged_perceptron_tagger', 'wordnet'):
    nltk.download(_nltk_resource)
#for model-building
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, f1_score, accuracy_score, confusion_matrix
from sklearn.metrics import roc_curve, auc, roc_auc_score
# bag of words
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
#for word embedding
import gensim
from gensim.models import Word2Vec

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
# Load the tweets CSV into a DataFrame and preview the first rows
tweets_csv_path = "/content/MentalHealth (2).csv"
df = pd.read_csv(tweets_csv_path)
df.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,user,tweet,location,description,friends_count,followers_count,statuses_count,created_at,retweet_count,hashtags,disorder
0,0,0,TheKenyanPost,SHOCK as a young man climbs on top of a hospit...,"Nairobi, Kenya","News, Politics, Entertainment, Gossip, Feature...",434,52951,356584,2021-08-17 10:33:55,0,[],depression
1,1,1,Goodguy_254,People tend to hide depression under drinking ...,"Nairobi, Kenya",poet\n writerüßæ\nsoftware developerüñ•Ô∏è\n program...,2074,2257,1284,2021-08-17 10:20:30,0,[],depression
2,2,2,AlfredLete,@Lily_nganga Depression,Nairobi,"do what is right,not what is easy",8515,13075,37974,2021-08-17 09:32:49,0,[],depression
3,3,3,GeraldNgaoPk7,Itumbi has subjected bloggers in to depression...,"Nairobi, Kenya",Pan-Africanist||NeoMarxistRevolutionary||Inter...,21503,20437,164694,2021-08-17 08:56:26,2,"[{'text': 'ItumbiGhostWorkers', 'indices': [77...",depression
4,4,4,swyma304,Mental health isn‚Äôt just anxiety &amp; depress...,"Nairobi, Kenya",Psychiatry Resident. Certified Cognitive Behav...,93,206,594,2021-08-17 08:18:20,1,[],depression


# Data Cleaning

Validity

In [None]:
# Dropping all irrelevant columns
droplist = ['Unnamed: 0','Unnamed: 0.1','description','friends_count', 'user',
       'followers_count', 'statuses_count', 'retweet_count',
       'hashtags']
# Rebind df instead of mutating it in place, and ignore labels that are already
# gone, so re-running this cell does not raise KeyError.
df = df.drop(columns=droplist, errors='ignore')
df.columns

Index(['tweet', 'location', 'created_at', 'disorder'], dtype='object')

Consistency

In [None]:
# Count the rows flagged by df.duplicated() and report the total
duplicate_row_count = df.duplicated().sum()
print('Number of rows with duplicated values', duplicate_row_count)

Number of rows with duplicated values 0


Completeness

In [None]:
# Number of missing values in each column
df.isna().sum()

tweet         0
location      3
created_at    0
disorder      0
dtype: int64

Uniformity

In [None]:
# Normalise the column labels: trim, lower-case, spaces -> underscores, and
# remove parentheses.
# regex=False forces literal matching, so '(' and ')' are never interpreted as
# (invalid) regular expressions, whatever the installed pandas default is.
df.columns = (
    df.columns.str.strip()
    .str.lower()
    .str.replace(' ', '_', regex=False)
    .str.replace('(', '', regex=False)
    .str.replace(')', '', regex=False)
)
df.columns

Index(['tweet', 'location', 'created_at', 'disorder'], dtype='object')

In [None]:
# Inspect the data type of every column; as the last expression in the cell,
# the result is displayed as the cell output.
df.dtypes

tweet         object
location      object
created_at    object
disorder      object
dtype: object

In [None]:
# Parse the created_at strings into pandas datetimes, then re-check the dtypes
df['created_at'] = pd.to_datetime(df['created_at'])
df.dtypes

tweet                 object
location              object
created_at    datetime64[ns]
disorder              object
dtype: object

# Data Preprocessing

In [None]:
# Stopword corpus reader (the same name is also imported in the first cell).
from nltk.corpus import stopwords
# Download the stopword lists that stopwords.words('english') reads.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
#convert to lowercase, strip and remove punctuations
def preprocess(text):
    """Normalise a raw tweet into lower-case words separated by single spaces.

    Steps, in order:
      1. lower-case and trim;
      2. drop ``<...>`` markup;
      3. drop URLs, so links do not leak tokens such as "http"/"co";
      4. decode HTML entities (``&amp;`` -> ``&``), so they do not leak tokens
         such as "amp";
      5. replace ASCII punctuation with spaces and delete any remaining
         non-word, non-space character (e.g. curly quotes);
      6. replace digits with spaces;
      7. collapse whitespace runs and trim.

    Parameters
    ----------
    text : str
        Raw tweet text. Any non-string value (e.g. None or NaN from a missing
        cell) yields an empty string instead of raising.

    Returns
    -------
    str
        The cleaned text, without leading or trailing whitespace.
    """
    import html  # stdlib; imported locally so this cell stays self-contained

    if not isinstance(text, str):
        return ''

    text = text.lower().strip()
    # Markup is removed before URLs so a link inside a tag cannot swallow the
    # closing '>' and leave half a tag behind.
    text = re.sub(r'<.*?>', '', text)
    text = re.sub(r'https?://\S+|www\.\S+', ' ', text)
    # Entities are decoded after tag stripping so an escaped '<' / '>' pair is
    # never mistaken for markup.
    text = html.unescape(text)
    text = re.sub('[%s]' % re.escape(string.punctuation), ' ', text)
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\d', ' ', text)
    return re.sub(r'\s+', ' ', text).strip()

 
# STOPWORD REMOVAL
def stopword(string):
    """Remove English stopwords from a whitespace-separated string.

    Parameters
    ----------
    string : str
        Text to filter; it is split on whitespace.

    Returns
    -------
    str
        The remaining words joined by single spaces.
    """
    # Build the lookup once per call. The previous version called
    # stopwords.words('english') for every token and searched the returned
    # list linearly.
    english_stopwords = frozenset(stopwords.words('english'))
    return ' '.join(word for word in string.split() if word not in english_stopwords)
#LEMMATIZATION
# Create a single WordNetLemmatizer instance; lemmatizer() below reuses it
# for every token.
wl = WordNetLemmatizer()
 
# Helper that maps an NLTK part-of-speech tag onto a WordNet POS constant
def get_wordnet_pos(tag):
    """Return the WordNet POS constant matching a POS tag string.

    Tags starting with J, V, N or R map to ADJ, VERB, NOUN and ADV
    respectively; every other tag falls back to NOUN.
    """
    prefix_to_pos = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, pos in prefix_to_pos:
        if tag.startswith(prefix):
            return pos
    return wordnet.NOUN
# Tokenize, POS-tag and lemmatize a sentence
def lemmatizer(string):
    """Lemmatize every token of ``string`` and return them joined by spaces.

    The text is tokenized with word_tokenize and tagged with nltk.pos_tag;
    each token is then lemmatized using the WordNet POS derived from its tag.
    """
    tagged_tokens = nltk.pos_tag(word_tokenize(string))
    lemmas = []
    for token, pos_tag in tagged_tokens:
        lemmas.append(wl.lemmatize(token, get_wordnet_pos(pos_tag)))
    return " ".join(lemmas)

In [None]:
def remove_stopwords(string):
    """Run the whole cleaning pipeline on one text: preprocess, stopword, lemmatizer."""
    normalized = preprocess(string)
    without_stopwords = stopword(normalized)
    return lemmatizer(without_stopwords)

# Clean every tweet and store the result in a new clean_tweet column
df['clean_tweet'] = df['tweet'].apply(remove_stopwords)
df.head()

Unnamed: 0,tweet,location,created_at,disorder,clean_tweet
0,SHOCK as a young man climbs on top of a hospit...,"Nairobi, Kenya",2021-08-17 10:33:55,depression,shock young man climb top hospital building ka...
1,People tend to hide depression under drinking ...,"Nairobi, Kenya",2021-08-17 10:20:30,depression,people tend hide depression drinking drug take...
2,@Lily_nganga Depression,Nairobi,2021-08-17 09:32:49,depression,lily nganga depression
3,Itumbi has subjected bloggers in to depression...,"Nairobi, Kenya",2021-08-17 08:56:26,depression,itumbi subject blogger depression ought behind...
4,Mental health isn‚Äôt just anxiety &amp; depress...,"Nairobi, Kenya",2021-08-17 08:18:20,depression,mental health isnt anxiety amp depression nag ...


In [None]:
# Work on an independent copy of df (DataFrame.copy is deep by default)
df1 = df.copy()
df1.head()

Unnamed: 0,tweet,location,created_at,disorder,clean_tweet
0,SHOCK as a young man climbs on top of a hospit...,"Nairobi, Kenya",2021-08-17 10:33:55,depression,shock young man climb top hospital building ka...
1,People tend to hide depression under drinking ...,"Nairobi, Kenya",2021-08-17 10:20:30,depression,people tend hide depression drinking drug take...
2,@Lily_nganga Depression,Nairobi,2021-08-17 09:32:49,depression,lily nganga depression
3,Itumbi has subjected bloggers in to depression...,"Nairobi, Kenya",2021-08-17 08:56:26,depression,itumbi subject blogger depression ought behind...
4,Mental health isn‚Äôt just anxiety &amp; depress...,"Nairobi, Kenya",2021-08-17 08:18:20,depression,mental health isnt anxiety amp depression nag ...


In [None]:
import random

# Seed Python's global random module
random.seed(100)

# Draw 100 rows from df1; the fixed random_state makes the draw repeatable
df_test = df1.sample(n=100, random_state=42)
df_test

Unnamed: 0,tweet,location,created_at,disorder,clean_tweet
408,Your feelings about climate are justified -- i...,"Nairobi, Kenya",2021-08-15 17:05:00,anxiety,feeling climate justified extremely worried ca...
97,"Dear HR and recruiters, \nPlease stop worrying...","Nairobi, Kenya",2021-08-14 16:34:30,depression,dear hr recruiter please stop worry font size ...
424,You are released from anxiety When you know ...,"Nairobi, Kenya",2021-08-15 07:52:00,anxiety,release anxiety know however little may seem t...
584,@Note_OKay Thank you &amp; you are welcome. If...,"Nairobi, Kenya",2021-08-10 10:03:09,anxiety,note okay thank amp welcome anxiety try osteoc...
603,Anxiety does not empty tomorrow of its sorrows...,"Nakuru, Kenya",2021-08-09 19:04:06,anxiety,anxiety empty tomorrow sorrow empty today stre...
...,...,...,...,...,...
7,@HisiaPsychology @Kinyah_Y @Kabiru_Nancy @kani...,"Nairobi, Kenya",2021-08-17 07:50:49,depression,hisiapsychology kinyah kabiru nancy kaniu ndun...
369,People with Anxiety understand why ghosting is...,"Nairobi, Kenya",2021-08-16 19:42:18,anxiety,people anxiety understand ghost mental abuse
23,"Your profile says you have depression, so we d...","Nairobi, Kenya",2021-08-16 22:15:06,depression,profile say depression blame bbnaija http co a...
404,@BecomeAManAgain Public speaking anxiety. How ...,"Nairobi, Kenya",2021-08-15 18:59:18,anxiety,becomeamanagain public speaking anxiety deal


In [None]:
# Index labels of the rows sampled into df_test
# NOTE(review): presumably used later to drop these rows from df1 - not visible in this section.
drop_idx = df_test.index


In [None]:
# (rows, columns) of df1
df1.shape

(756, 5)

In [None]:
# create Word2vec model
# Word2Vec expects one list of tokens per document/sentence, so the training
# corpus has as many entries as there are tweets in the dataset.
df1['clean_tweet_tok'] = [nltk.word_tokenize(i) for i in df1['clean_tweet']]  # convert preprocessed sentence to tokenized sentence
# min_count=1 keeps every word that occurs at least once in the corpus;
# with min_count=2, words occurring fewer than 2 times would be ignored.
model = Word2Vec(df1['clean_tweet_tok'], min_count=1)

# Map each vocabulary word to its vector.
# gensim 4 replaced `wv.index2word` with `wv.index_to_key` and removed
# `wv.syn0` in favour of `wv.vectors`; fall back to the old vocabulary
# attribute so the cell also runs on gensim 3.x.
word_vectors = model.wv
vocabulary = (word_vectors.index_to_key if hasattr(word_vectors, 'index_to_key')
              else word_vectors.index2word)
w2v = dict(zip(vocabulary, word_vectors.vectors))  # combination of word and its vector

#for converting sentence to vectors/numbers from word vectors result by Word2Vec
class MeanEmbeddingVectorizer(object):
    """Turn tokenized documents into the mean of their word vectors.

    Parameters
    ----------
    word2vec : mapping of str -> array-like
        Word-to-vector lookup. It must contain at least one entry, because
        the vector dimensionality is read from its first value.

    Raises
    ------
    ValueError
        If ``word2vec`` is empty.
    """

    def __init__(self, word2vec):
        self.word2vec = word2vec
        # if a text is empty we should return a vector of zeros
        # with the same dimensionality as all the other vectors
        try:
            first_vector = next(iter(word2vec.values()))
        except StopIteration:
            # An empty mapping used to surface as a bare StopIteration.
            raise ValueError(
                "word2vec mapping is empty; cannot infer the vector dimensionality"
            ) from None
        self.dim = len(first_vector)

    def fit(self, X, y=None):
        """No-op fit that returns ``self``; ``y`` is optional and ignored."""
        return self

    def transform(self, X):
        """Return one row per document: the mean vector of its known words.

        Words missing from the mapping are skipped. A document with no known
        word (including an empty document) becomes a zero vector of length
        ``self.dim``.
        """
        rows = []
        for words in X:
            known_vectors = [self.word2vec[w] for w in words if w in self.word2vec]
            if known_vectors:
                rows.append(np.mean(known_vectors, axis=0))
            else:
                rows.append(np.zeros(self.dim))
        return np.array(rows)

  if __name__ == '__main__':
