In [25]:
import numpy as np
import pandas as pd
import re
from sklearn.neighbors import KNeighborsClassifier
from sklearn.decomposition import PCA
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression

In [26]:
train = pd.read_csv("./text.csv").dropna()

In [27]:
def cleanText(text):
    # Loại bỏ các @mentions
    text = re.sub(r'@\w+', '', text)
    # Loại bỏ các ký hiệu không phải chữ cái, số hoặc khoảng trắng
    text = re.sub(r'[^A-Za-z0-9\s]', '', text)
    # Loại bỏ chuỗi "RT"
    text = re.sub(r'\bRT\b', '', text)
    # Loại bỏ các liên kết hyper-link
    text = re.sub(r'http\S+|www\S+|https\S+', '', text)
    # Loại bỏ dấu hai chấm theo sau bởi một hoặc nhiều khoảng trắng
    text = re.sub(r':[\s]+', '', text)
    # Loại bỏ khoảng trắng thừa
    text = text.strip()
    return text

In [28]:
train['text'] = train['text'].apply(cleanText)


In [29]:
train.head()

Unnamed: 0.1,Unnamed: 0,text,label
0,0,i just feel really helpless and heavy hearted,4
1,1,ive enjoyed being able to slouch about relax a...,0
2,2,i gave up my internship with the dmrg and am f...,4
3,3,i dont know i feel so lost,0
4,4,i am a kindergarten teacher and i am thoroughl...,4


In [30]:
#Copy the unprocessed train and test dataframes

#label_classes = ['anger','anticipation','disgust','fear','joy','love','optimism','pessimism','sadness','surprise','trust','neutral']

X= train['text'] 
y = train['label']


In [31]:
tfid_vectorizer = TfidfVectorizer(ngram_range=(1,2), min_df=2, stop_words="english")
X = tfid_vectorizer.fit_transform(X)
X.shape

(416809, 361610)

In [32]:
from sklearn.decomposition import TruncatedSVD
svd_ngram_char = TruncatedSVD(n_components=1000, random_state=42)


X = svd_ngram_char.fit_transform(X)
X.shape

(416809, 1000)

In [33]:
# svd_ngram_char = PCA(n_components=300)
# X = svd_ngram_char.fit_transform(X)
# X.shape

In [34]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [39]:
knc =KNeighborsClassifier()

knc.fit(X_train, y_train)

#Form a prediction set
predictions_knc = knc.predict(X_val)

 # Print a classification report
print(classification_report(y_val, predictions_knc))



              precision    recall  f1-score   support

           0       0.95      0.94      0.94     24201
           1       0.90      0.95      0.92     28164
           2       0.86      0.77      0.81      6929
           3       0.91      0.90      0.91     11441
           4       0.87      0.87      0.87      9594
           5       0.84      0.70      0.76      3033

    accuracy                           0.91     83362
   macro avg       0.89      0.85      0.87     83362
weighted avg       0.91      0.91      0.90     83362



In [40]:
SVM =LinearSVC()

SVM.fit(X_train, y_train)

#Form a prediction set
predictions_SVM = SVM.predict(X_val)

 # Print a classification report
print(classification_report(y_val, predictions_SVM))



              precision    recall  f1-score   support

           0       0.95      0.94      0.94     24201
           1       0.90      0.95      0.92     28164
           2       0.86      0.77      0.81      6929
           3       0.91      0.90      0.91     11441
           4       0.87      0.87      0.87      9594
           5       0.84      0.70      0.76      3033

    accuracy                           0.91     83362
   macro avg       0.89      0.85      0.87     83362
weighted avg       0.91      0.91      0.90     83362



In [41]:
LSG =LogisticRegression()

LSG.fit(X_train, y_train)

#Form a prediction set
predictions_LSG = LSG.predict(X_val)

 # Print a classification report
print(classification_report(y_val, predictions_LSG))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


              precision    recall  f1-score   support

           0       0.94      0.94      0.94     24201
           1       0.89      0.96      0.92     28164
           2       0.88      0.75      0.81      6929
           3       0.92      0.89      0.90     11441
           4       0.88      0.85      0.87      9594
           5       0.86      0.69      0.77      3033

    accuracy                           0.90     83362
   macro avg       0.89      0.85      0.87     83362
weighted avg       0.90      0.90      0.90     83362



In [37]:
label_class=["sadness","joy","love","anger","fear","surprise "]

def predict_text(model,input):
    input = tfid_vectorizer.transform([input])
    input = svd_ngram_char.transform(input)
    predictions = model.predict(input)
    print(label_class[predictions[0]])
    # for v,l in zip(predictions_knc[0],label_classes):
    #     if( v == 1.0):
    #         print(l)


In [50]:
input ="i do feel so fear"
predict_text(knc,input)

joy
