In [7]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score


In [8]:
df=pd.read_csv('distress_non_distress_phrases.csv')

In [3]:
print(df)

                                      Phrase      Category  Label
0                                   Help me!      Distress      1
1                    Please call the police!      Distress      1
2                             I'm in danger!      Distress      1
3                         I don't feel safe!      Distress      1
4                   Someone is following me!      Distress      1
..                                       ...           ...    ...
195                       I love this place!  Non-Distress      0
196  I am planning to watch a movie tonight.  Non-Distress      0
197        I am making plans for the summer.  Non-Distress      0
198          I am catching up on some sleep.  Non-Distress      0
199    I am thinking about painting my room.  Non-Distress      0

[200 rows x 3 columns]


In [14]:
data=df.where((pd.notnull(df)),'')

In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Phrase    200 non-null    object
 1   Category  200 non-null    object
 2   Label     200 non-null    int64 
dtypes: int64(1), object(2)
memory usage: 4.8+ KB


In [6]:
data.shape

(200, 3)

In [15]:
X=data['Phrase']
Y=data['Label']

In [8]:
print(X)

0                                     Help me!
1                      Please call the police!
2                               I'm in danger!
3                           I don't feel safe!
4                     Someone is following me!
                        ...                   
195                         I love this place!
196    I am planning to watch a movie tonight.
197          I am making plans for the summer.
198            I am catching up on some sleep.
199      I am thinking about painting my room.
Name: Phrase, Length: 200, dtype: object


In [9]:
print(Y)

0      1
1      1
2      1
3      1
4      1
      ..
195    0
196    0
197    0
198    0
199    0
Name: Label, Length: 200, dtype: int64


In [16]:
X_train,X_test,Y_train,Y_test=train_test_split(X,Y,test_size=0.2,random_state=3)

In [11]:
print(X.shape)
print(X_train.shape)
print(X_test.shape)

(200,)
(160,)
(40,)


In [12]:
print(Y)
print(Y_train.shape)
print(Y_test.shape)

0      1
1      1
2      1
3      1
4      1
      ..
195    0
196    0
197    0
198    0
199    0
Name: Label, Length: 200, dtype: int64
(160,)
(40,)


In [17]:
feature_extraction = TfidfVectorizer(min_df=1, stop_words='english', lowercase=True)

X_train_features = feature_extraction.fit_transform(X_train)
X_test_features = feature_extraction.transform(X_test)

Y_train = Y_train.astype('int')
Y_test = Y_test.astype('int')


In [14]:
print(X_train)

156        This coffee is really good!
115          I am reading a good book.
155    I am enjoying my time off work.
15     Can you call the police for me?
61       I don't feel okay around him!
                    ...               
0                             Help me!
184    I am working on a side project.
131     I enjoy listening to podcasts.
152          I'll pick you up at 6 PM.
106         The weather is nice today.
Name: Phrase, Length: 160, dtype: object


In [15]:
print(X_train_features)

  (0, 25)	0.6002216901994932
  (0, 129)	0.5655678220222857
  (0, 61)	0.5655678220222857
  (1, 61)	0.5544784924263761
  (1, 128)	0.588452887424547
  (1, 14)	0.588452887424547
  (2, 41)	0.630123932210501
  (2, 169)	0.5490645818369528
  (2, 195)	0.5490645818369528
  (3, 124)	1.0
  (4, 39)	0.516067172725278
  (4, 48)	0.5008786362770564
  (4, 108)	0.6948347033335154
  (5, 121)	0.5719207902171602
  (5, 138)	0.6184597429735716
  (5, 174)	0.5389008777488146
  (6, 169)	0.4494132061771336
  (6, 37)	0.515760852241162
  (6, 158)	0.515760852241162
  (6, 23)	0.515760852241162
  (7, 94)	0.5617119774554691
  (7, 56)	0.5617119774554691
  (7, 43)	0.6074202077360723
  (8, 175)	0.6147957817775813
  (8, 42)	0.7886863424128079
  :	:
  (149, 33)	0.6148684090739794
  (150, 159)	0.7071067811865476
  (150, 44)	0.7071067811865476
  (151, 187)	0.6789431526595925
  (151, 176)	0.7341908440294345
  (152, 71)	0.6081027981994317
  (152, 127)	0.7938582913984216
  (153, 170)	0.4680258568724928
  (153, 92)	0.483642229940

In [18]:
model=LogisticRegression()

In [19]:
model.fit(X_train_features,Y_train)

In [20]:
prediction_on_training_data = model.predict(X_train_features)
accuracy_on_training_data = accuracy_score(Y_train, prediction_on_training_data)

In [19]:
print('Acc on training data : ', accuracy_on_training_data)

Acc on training data :  0.99375


In [20]:
prediction_on_training_data = model.predict(X_train_features)
accuracy_on_training_data = accuracy_score(Y_train, prediction_on_training_data)

In [21]:
print('Acc on training data : ', accuracy_on_training_data)

Acc on training data :  0.99375


In [26]:
input_your_mail = ["hello"]

input_data_features = feature_extraction.transform(input_your_mail)

prediction = model.predict(input_data_features)

print(prediction)

if(prediction[0] == 1):
    print('Ham mail')
else:
    print('Spam mail')


[0]
Spam mail
