In [27]:
import pandas as pd
import numpy as np
import re
import nltk
import gensim
from nltk.stem import WordNetLemmatizer

from gensim.parsing.preprocessing import remove_stopwords
from nltk.corpus import stopwords
nltk.download('wordnet')
nltk.download('stopwords')

from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_recall_fscore_support

from nltk.util import ngrams

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Piotr\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Piotr\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Load the labelled training set (columns `text` and `label`); the path is relative to the working directory.
data = pd.read_csv("training.csv")

In [3]:
# Preview the first rows to check that the `text` / `label` columns were parsed as expected.
data.head()

Unnamed: 0,text,label
0,i didnt feel humiliated,0
1,i can go from feeling so hopeless to so damned...,0
2,im grabbing a minute to post i feel greedy wrong,3
3,i am ever feeling nostalgic about the fireplac...,2
4,i am feeling grouchy,3


In [4]:
# Column dtypes and non-null counts, to spot missing values before preprocessing.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16000 entries, 0 to 15999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    16000 non-null  object
 1   label   16000 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 250.1+ KB


In [5]:
# Preview the last rows as well, to confirm the file was read through to the end.
data.tail()

Unnamed: 0,text,label
15995,i just had a very brief time in the beanbag an...,0
15996,i am now turning and i feel pathetic that i am...,0
15997,i feel strong and good overall,1
15998,i feel like this was such a rude comment and i...,3
15999,i know a lot but i feel so stupid because i ca...,0


In [6]:
# NOTE(review): repeats the earlier `data.info()` call; no visible cell modifies `data` in between,
# so this cell looks removable — confirm before deleting.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16000 entries, 0 to 15999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    16000 non-null  object
 1   label   16000 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 250.1+ KB


In [7]:
# One shared WordNet lemmatizer instance; it is passed into process_and_verify for every row.
lemmatizer = WordNetLemmatizer()

# Extra tokens that process_and_verify drops on top of the generic stop-word list.
# Presumably these are the fragments a URL leaves behind once its punctuation is
# replaced by spaces (e.g. "www.foo.com" -> "www", "foo", "com") — confirm.
custom_stopwords = ["http", "https", "www", "com", "tinyurl"]

# credit to https://pytutorial.com/check-strig-url
# The pattern is anchored with ^ and $, so it only matches when the WHOLE string is a URL
# with an explicit scheme; process_and_verify applies it to one whitespace-separated token
# at a time. re.IGNORECASE lets the upper-case character classes match lower-case text too.
url_pattern = re.compile(
        r'^(?:http|ftp)s?://' # scheme: http://, https://, ftp:// or ftps://
        r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|' # dotted domain name...
        r'localhost|' # ...or localhost...
        r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})' # ...or dotted-quad IPv4 address
        r'(?::\d+)?' # optional port
        r'(?:/?|[/?]\S+)$', re.IGNORECASE) # optional path / query string up to the end of the token

In [8]:
def process_and_verify(line, custom_stopwords, url_pattern, lemmatizer):
    """Normalize one raw text into a space-separated string of cleaned tokens.

    Processing order:
      1. Coerce the input to ``str``, lower-case it and strip stop words with
         ``remove_stopwords``.
      2. Split on single spaces and discard every token that ``url_pattern``
         matches.
      3. In each remaining token, replace every character other than ASCII
         letters, ``@``, ``'``, space and ``$`` with a space, then lemmatize.
      4. Skip tokens that start with ``@`` (nicknames). Split the others on
         spaces, strip everything except ASCII letters and ``$`` from each
         piece, and lemmatize the piece again.
      5. Keep pieces longer than one character that are not listed in
         ``custom_stopwords``.
      6. Join the kept pieces with spaces and run ``remove_stopwords`` once
         more on the result.

    Args:
        line: Raw text. Anything that is not a ``str`` is converted with
            ``str()`` before lower-casing.
        custom_stopwords: Container of additional tokens to drop; only
            membership tests (``in``) are performed on it.
        url_pattern: Compiled regular expression; a token is dropped when
            ``url_pattern.match(token)`` is not ``None``.
        lemmatizer: Object exposing ``lemmatize(str) -> str``.

    Returns:
        str: The cleaned tokens joined by single spaces (``""`` when nothing
        survives the filters).
    """
    # Coerce before lower-casing: calling .lower() first would already have
    # failed on non-string input, making any later str() call pointless.
    line = remove_stopwords(str(line).lower())

    # URLs are recognised on whole whitespace-separated tokens only.
    non_url_tokens = [
        token for token in line.split(' ')
        if url_pattern.match(token) is None
    ]

    # '@' deliberately survives this pass so the nickname test below can see it.
    spaced_tokens = [
        lemmatizer.lemmatize(re.sub("[^a-zA-Z@' $]", " ", token))
        for token in non_url_tokens
    ]

    kept_parts = []
    for token in spaced_tokens:
        # A leading '@' marks a nickname: none of its pieces are kept.
        if token.startswith('@'):
            continue
        for part in token.split(' '):
            part = lemmatizer.lemmatize(re.sub("[^a-zA-Z$]", "", part))
            # Drop empty / single-character pieces and the custom stop words.
            if len(part) > 1 and part not in custom_stopwords:
                kept_parts.append(part)

    return remove_stopwords(' '.join(kept_parts))

In [9]:
# Run the shared preprocessing routine over every raw text and collect the
# cleaned strings into a NumPy array.
Xdata = np.asarray([
    process_and_verify(raw_text, custom_stopwords, url_pattern, lemmatizer)
    for raw_text in data['text'].tolist()
])

In [10]:
# Inspect the cleaned texts (NumPy abbreviates the repr of long arrays).
Xdata

array(['didnt feel humiliated',
       'feeling hopeless damned hopeful care awake',
       'im grabbing minute post feel greedy wrong', ...,
       'feel strong good overall', 'feel like rude comment im glad',
       'know lot feel stupid portray'], dtype='<U225')

In [12]:
# Learn the TF-IDF vocabulary and weights from the cleaned texts.
vectorizer = TfidfVectorizer()

# `toarray()` returns the dense ndarray directly, instead of building a
# `np.matrix` with `todense()` and converting it afterwards.
# NOTE(review): this materialises one dense row per document over the whole
# vocabulary; consider keeping the sparse matrix if memory becomes a problem —
# confirm that every downstream cell accepts sparse input first.
Xdata_encoded = vectorizer.fit_transform(Xdata).toarray()

In [21]:
# Inspect the dense TF-IDF matrix (one row per text; NumPy abbreviates the repr).
Xdata_encoded

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [28]:
# NOTE(review): the TF-IDF vectorizer was fitted on every row before this split,
# so the held-out rows contributed to the vocabulary and IDF weights. Consider
# splitting the cleaned texts first and fitting the vectorizer on the training part only.

# Hold out 20% of the rows for evaluation. The fixed seed makes the split, and
# therefore the reported metrics, identical on every run.
X_train, X_test, y_train, y_test = train_test_split(
    Xdata_encoded, data['label'], test_size=0.20, random_state=42
)

classifier = LogisticRegression(solver='liblinear')
classifier.fit(X_train, y_train)

y_res = classifier.predict(X_test)
# Returns four arrays with one entry per class: precision, recall, F1 score and support.
precision_recall_fscore_support(y_test, y_res)


(array([0.8747495 , 0.76671851, 0.90797546, 0.90607735, 0.88023952,
        0.9122807 ]),
 array([0.92478814, 0.95450145, 0.55639098, 0.75750577, 0.72235872,
        0.44444444]),
 array([0.89907312, 0.85036654, 0.68997669, 0.82515723, 0.79352227,
        0.59770115]),
 array([ 944, 1033,  266,  433,  407,  117], dtype=int64))