In [1]:
import numpy as np 
import pandas as pd 
import re
import nltk
from nltk.corpus import stopwords
import nltk as nlp
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn import metrics

In [2]:
# Read the raw response sheet (path is relative to the notebook's working directory).
df = pd.read_csv(filepath_or_buffer="Sheet_1.csv")
df

Unnamed: 0,response_id,class,response_text,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7
0,response_1,not_flagged,I try and avoid this sort of conflict,,,,,
1,response_2,flagged,Had a friend open up to me about his mental ad...,,,,,
2,response_3,flagged,I saved a girl from suicide once. She was goin...,,,,,
3,response_4,not_flagged,i cant think of one really...i think i may hav...,,,,,
4,response_5,not_flagged,Only really one friend who doesn't fit into th...,,,,,
...,...,...,...,...,...,...,...,...
75,response_76,not_flagged,"Now that I've been through it, although i'm no...",,,,,
76,response_77,flagged,when my best friends mom past away from od'ing...,,,,,
77,response_78,not_flagged,As a camp counselor I provide stability in kid...,,,,,
78,response_79,flagged,My now girlfriend used to have serious addicti...,,,,,


In [3]:
# Keep only the label and the free-text answer, then discard every row in which
# either of the two is missing. The original row index is preserved.
data = df[["class", "response_text"]].dropna(axis=0).copy()
data

Unnamed: 0,class,response_text
0,not_flagged,I try and avoid this sort of conflict
1,flagged,Had a friend open up to me about his mental ad...
2,flagged,I saved a girl from suicide once. She was goin...
3,not_flagged,i cant think of one really...i think i may hav...
4,not_flagged,Only really one friend who doesn't fit into th...
...,...,...
75,not_flagged,"Now that I've been through it, although i'm no..."
76,flagged,when my best friends mom past away from od'ing...
77,not_flagged,As a camp counselor I provide stability in kid...
78,flagged,My now girlfriend used to have serious addicti...


In [4]:
# Encode the target as integers:
#   not_flagged -> 0
#   flagged     -> 1
#
# The labels are read from the untouched `df` (so re-running this cell gives the
# same result) but restricted to the rows that survived `dropna` in `data`.
# Assigning a Series aligns on the index, whereas the previous positional list
# raised a length-mismatch ValueError as soon as any row had been dropped.
data["class"] = (df.loc[data.index, "class"] == "flagged").astype("int64")
data

Unnamed: 0,class,response_text
0,0,I try and avoid this sort of conflict
1,1,Had a friend open up to me about his mental ad...
2,1,I saved a girl from suicide once. She was goin...
3,0,i cant think of one really...i think i may hav...
4,0,Only really one friend who doesn't fit into th...
...,...,...
75,0,"Now that I've been through it, although i'm no..."
76,1,when my best friends mom past away from od'ing...
77,0,As a camp counselor I provide stability in kid...
78,1,My now girlfriend used to have serious addicti...


In [5]:
# Summary of `data`: index range, column names, non-null counts, dtypes and memory usage.

data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 80 entries, 0 to 79
Data columns (total 2 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   class          80 non-null     int64 
 1   response_text  80 non-null     object
dtypes: int64(1), object(1)
memory usage: 1.4+ KB


In [9]:
# Column names of the working dataset.

data.columns

Index(['class', 'response_text'], dtype='object')

# Natural Language Processing (NLP)

Natural language processing (NLP) is a branch of artificial intelligence (AI) that enables computers to comprehend, generate, and manipulate human language.

In [10]:
# Preview the first ten rows of `data`.
data.head(10)

Unnamed: 0,class,response_text
0,0,I try and avoid this sort of conflict
1,1,Had a friend open up to me about his mental ad...
2,1,I saved a girl from suicide once. She was goin...
3,0,i cant think of one really...i think i may hav...
4,0,Only really one friend who doesn't fit into th...
5,0,a couple of years ago my friends was going to ...
6,1,Roommate when he was going through death and l...
7,1,i've had a couple of friends (you could say mo...
8,0,Listened to someone talk about relationship tr...
9,1,I will always listen. I comforted my sister wh...


 Remove non-letter characters (e.g. /, *, -, ', ") using Python's `re` library.

In [11]:
# Show the response whose index label is 46; it is used as the worked example below.
data.loc[46, "response_text"]

'I open myself to any friend in need of emotional support.'

In [12]:
# Worked example on a single response: replace every character that is not an
# ASCII letter with a space, then lower-case the result.
first_text = data.response_text[46]
text = re.sub("[^a-zA-Z]", " ", first_text).lower()
print(text)

i open myself to any friend in need of emotional support 


Irrelevant Words (Stopwords)

In [13]:
# Tokenise the cleaned sample and drop English stopwords.
# Requires the NLTK "punkt" and "stopwords" resources to be available locally.
#
# The stopword set is built once here; previously it was rebuilt from the corpus
# for every single token inside the comprehension.
english_stopwords = set(stopwords.words("english"))

text = nltk.word_tokenize(text)
text = [word for word in text if word not in english_stopwords]
print(text)

['open', 'friend', 'need', 'emotional', 'support']


In [14]:
# Reduce each remaining token of the sample to its lemma (default part of speech)
# and glue the tokens back into a single space-separated string.
lemma = WordNetLemmatizer()
text = " ".join(lemma.lemmatize(word) for word in text)
text

'open friend need emotional support'

In [15]:
# Apply the full preprocessing pipeline to every response:
#   1. replace non-letters with spaces and lower-case,
#   2. tokenise,
#   3. remove English stopwords,
#   4. lemmatise each token as noun, then verb, then adjective,
#   5. re-join the tokens into one string.
#
# The stopword set and the lemmatizer are loop-invariant, so they are created
# once up front instead of once per token / once per document.
english_stopwords = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()

description_list = []
for description in data.response_text:
    letters_only = re.sub("[^a-zA-Z]", " ", description).lower()

    tokens = [
        word
        for word in nltk.word_tokenize(letters_only)
        if word not in english_stopwords
    ]

    lemmas = []
    for word in tokens:
        # Chain the lemmatizer over the three parts of speech, feeding the
        # output of one pass into the next (noun -> verb -> adjective).
        for pos in ("n", "v", "a"):
            word = lemmatizer.lemmatize(word, pos)
        lemmas.append(word)

    description = " ".join(lemmas)
    description_list.append(description)

description_list

['try avoid sort conflict',
 'friend open mental addiction weed take life make depress',
 'save girl suicide go swallow bunch pill talk calm love way',
 'cant think one really think may indirectly',
 'really one friend fit category therapist call spiral anyway pretty much call time frustrate something boyfriend ask logical would fight would call crazy ask ok say please say hand remote',
 'couple year ago friend go switch school low self esteem help overcome shit',
 'roommate go death loss gf anything get bedroom',
 'couple friend could say friend quite severe depression emotional problem help eventually relationship start suffer result personal problem',
 'listen someone talk relationship trouble offer advice personal experience',
 'always listen comfort sister lose virgity night walk boyfriend cut parent find throw house part simply bring supportive focus',
 'take week work pack car pick friend verge lose go camp surf week parent big part problem away others physical activity every da

In [16]:
# Bag-of-words representation: keep the `max_features` most frequent terms and
# convert the sparse count matrix to a dense array (GaussianNB needs dense input).
max_features = 250
count_vectorizer = CountVectorizer(max_features=max_features)
sparce_matrix = count_vectorizer.fit_transform(description_list).toarray()

# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer `get_feature_names_out()` and fall back only on older installations.
# `.tolist()` keeps the printed value a plain list of Python strings.
if hasattr(count_vectorizer, "get_feature_names_out"):
    feature_names = count_vectorizer.get_feature_names_out().tolist()
else:
    feature_names = count_vectorizer.get_feature_names()

# Note: the retained vocabulary is listed alphabetically, not by frequency.
print("Top {} Most used words: {}".format(max_features, feature_names))


Top 250 Most used words: ['addiction', 'advice', 'alcoholic', 'almost', 'alone', 'also', 'always', 'answer', 'anxiety', 'anything', 'around', 'ask', 'away', 'back', 'bad', 'basically', 'best', 'big', 'bite', 'blow', 'blunt', 'boyfriend', 'bring', 'call', 'calm', 'camp', 'cant', 'care', 'change', 'clean', 'come', 'comfort', 'could', 'couple', 'day', 'deal', 'depress', 'depression', 'describe', 'desire', 'diagnose', 'dont', 'dump', 'either', 'else', 'emotional', 'end', 'esteem', 'even', 'every', 'everything', 'ex', 'experience', 'express', 'face', 'feel', 'felt', 'find', 'friend', 'get', 'gf', 'girl', 'girlfriend', 'give', 'go', 'good', 'grade', 'guess', 'guy', 'happen', 'hard', 'head', 'hear', 'help', 'helpful', 'high', 'hold', 'honest', 'hospital', 'important', 'internet', 'irl', 'issue', 'keep', 'kid', 'kill', 'know', 'last', 'let', 'life', 'like', 'listen', 'little', 'look', 'lose', 'lot', 'low', 'make', 'many', 'may', 'mental', 'mom', 'month', 'much', 'need', 'never', 'night', 'offe



Naive Bayes Classification 

In [17]:
# Feature matrix: the dense token-count array.
x = sparce_matrix
# Target vector: the integer-encoded "class" column (the first column of `data`).
y = data["class"].values

In [18]:
# Display the feature matrix assigned to `x`.
x

array([[0, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [3, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [19]:
# Display the target vector assigned to `y`.
y

array([0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0], dtype=int64)

In [20]:
# Hold out 20% of the samples for evaluation; the fixed seed makes the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=46,
)

In [21]:
# Fit a Gaussian Naive Bayes classifier on the training split and score it on
# the held-out split.
gnb = GaussianNB()
gnb.fit(x_train, y_train)

y_pred = gnb.predict(x_test)

# The message previously named the wrong model ("Random Forest").
# accuracy_score expects (y_true, y_pred); accuracy is symmetric, so the value
# is unchanged, but the documented argument order is used for clarity.
print('The accuracy of the Gaussian Naive Bayes classifier is',
      metrics.accuracy_score(y_test, y_pred))

The accuracy of the Random Forest is 0.8125
