In [None]:
import numpy as np
import pandas as pd

In [None]:
train_data = pd.read_csv('/content/twitter_training.csv')

In [None]:
train_data.columns = ["ID", "Game", "Sentiment", "Comment"]

In [None]:
train_data

Unnamed: 0,ID,Game,Sentiment,Comment
0,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
1,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
2,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
3,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...
4,2401,Borderlands,Positive,im getting into borderlands and i can murder y...
...,...,...,...,...
74676,9200,Nvidia,Positive,Just realized that the Windows partition of my...
74677,9200,Nvidia,Positive,Just realized that my Mac window partition is ...
74678,9200,Nvidia,Positive,Just realized the windows partition of my Mac ...
74679,9200,Nvidia,Positive,Just realized between the windows partition of...


In [None]:
train_data.head()

Unnamed: 0,ID,Game,Sentiment,Comment
0,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
1,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
2,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
3,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...
4,2401,Borderlands,Positive,im getting into borderlands and i can murder y...


In [None]:
train_data.shape

(74681, 4)

In [None]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 74681 entries, 0 to 74680
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   ID         74681 non-null  int64 
 1   Game       74681 non-null  object
 2   Sentiment  74681 non-null  object
 3   Comment    73995 non-null  object
dtypes: int64(1), object(3)
memory usage: 2.3+ MB


In [None]:
train_data.isnull().sum()

Unnamed: 0,0
ID,0
Game,0
Sentiment,0
Comment,686


In [None]:
rows_with_missing_values = train_data[train_data.isnull().any(axis=1)]

In [None]:
rows_with_missing_values

Unnamed: 0,ID,Game,Sentiment,Comment
60,2411,Borderlands,Neutral,
552,2496,Borderlands,Neutral,
588,2503,Borderlands,Neutral,
744,2532,Borderlands,Positive,
1104,2595,Borderlands,Positive,
...,...,...,...,...
73971,9073,Nvidia,Positive,
73972,9073,Nvidia,Positive,
74420,9154,Nvidia,Positive,
74421,9154,Nvidia,Positive,


In [None]:
train_data['Sentiment'].unique()

array(['Positive', 'Neutral', 'Negative', 'Irrelevant'], dtype=object)

In [None]:
train_data = train_data.dropna(subset=["Comment"])

In [None]:
train_data.shape

(73995, 4)

In [None]:
train_data.isnull().sum()

Unnamed: 0,0
ID,0
Game,0
Sentiment,0
Comment,0


In [None]:
train_data.drop(columns=['ID','Game'])

Unnamed: 0,Sentiment,Comment
0,Positive,I am coming to the borders and I will kill you...
1,Positive,im getting on borderlands and i will kill you ...
2,Positive,im coming on borderlands and i will murder you...
3,Positive,im getting on borderlands 2 and i will murder ...
4,Positive,im getting into borderlands and i can murder y...
...,...,...
74676,Positive,Just realized that the Windows partition of my...
74677,Positive,Just realized that my Mac window partition is ...
74678,Positive,Just realized the windows partition of my Mac ...
74679,Positive,Just realized between the windows partition of...


In [None]:
from sklearn.preprocessing import LabelEncoder

le_sentiment = LabelEncoder()

train_data['Sentiment']= train_data.loc[:, 'Sentiment'] = le_sentiment.fit_transform(train_data['Sentiment'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_data['Sentiment']= train_data.loc[:, 'Sentiment'] = le_sentiment.fit_transform(train_data['Sentiment'])


In [None]:
train_data['Sentiment'].unique()

array([3, 2, 1, 0])

In [None]:
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
train_data.loc[: , 'Comment'] = train_data['Comment'].apply(lambda x: x.replace('\/@<>!', ' '))

In [None]:
stemmer = PorterStemmer()
corpus = []

for i in range(len(train_data)):
  text = train_data['Comment'].iloc[i].lower()
  text = text.translate(str.maketrans('','' , string.punctuation)).split()
  text = [stemmer.stem(word) for word in text if word not in stop_words]
  text = ' '.join(text)
  corpus.append(text)


def remove_pattern(txt, pattern):
  r = re.findall(pattern , txt)
  for i in r:
    txt = re.sub(i,"", txt)
  return txt


train_data.loc[: ,'Comment'] = np.vectorize(remove_pattern)(train_data['Comment'],"@[\w]*")
train_data.loc[: ,'Comment'] = train_data['Comment'].str.replace("(^a-zA-Z#)", " ")

In [None]:
train_data['Comment'].head()

Unnamed: 0,Comment
0,I am coming to the borders and I will kill you...
1,im getting on borderlands and i will kill you ...
2,im coming on borderlands and i will murder you...
3,im getting on borderlands 2 and i will murder ...
4,im getting into borderlands and i can murder y...


In [None]:
vectorizer = TfidfVectorizer()

# Fit and transform the text data
tfidf_matrix = vectorizer.fit_transform(corpus)

# Convert the TF-IDF matrix to a dense format for better readability
dense_tfidf_matrix = tfidf_matrix.todense()

# Get the feature names (words)
feature_names = vectorizer.get_feature_names_out()

In [None]:
print(tfidf_matrix)

  (0, 16588)	0.4935921263717654
  (0, 5350)	0.7489447769623363
  (0, 7113)	0.44209561617893856
  (1, 5353)	0.5409451870265484
  (1, 12232)	0.40545561612822834
  (1, 15038)	0.44833417603329767
  (1, 16588)	0.5847910007715882
  (2, 19401)	0.6918798879860053
  (2, 5353)	0.4456827302092061
  (2, 15038)	0.3693808622624985
  (2, 7113)	0.43154015246319066
  (3, 19401)	0.719240657478231
  (3, 5353)	0.46330749812002964
  (3, 12232)	0.3472636999317
  (3, 15038)	0.383988230973017
  (4, 19401)	0.719240657478231
  (4, 5353)	0.46330749812002964
  (4, 12232)	0.3472636999317
  (4, 15038)	0.383988230973017
  (5, 22407)	0.3216187405867469
  (5, 9967)	0.16244253261048408
  (5, 17932)	0.15725419337395113
  (5, 7630)	0.3010257240265198
  (5, 31186)	0.27532713207621295
  (5, 15041)	0.22397763241601787
  :	:
  (73992, 32890)	0.19524920296302842
  (73993, 21421)	0.4226811982132595
  (73993, 20371)	0.20246610199981266
  (73993, 9307)	0.28676279397356685
  (73993, 32149)	0.2733760565372244
  (73993, 6155)	0.273

In [None]:
print(dense_tfidf_matrix)

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [None]:
print(feature_names)

['00' '000' '00011' ... 'การออกอากาศของฉ' 'นจาก' 'ℐℓ٥']


In [None]:
train_data.head()

Unnamed: 0,ID,Game,Sentiment,Comment
0,2401,Borderlands,3,I am coming to the borders and I will kill you...
1,2401,Borderlands,3,im getting on borderlands and i will kill you ...
2,2401,Borderlands,3,im coming on borderlands and i will murder you...
3,2401,Borderlands,3,im getting on borderlands 2 and i will murder ...
4,2401,Borderlands,3,im getting into borderlands and i can murder y...


In [None]:
train_data.head()

Unnamed: 0,ID,Game,Sentiment,Comment
0,2401,Borderlands,3,I am coming to the borders and I will kill you...
1,2401,Borderlands,3,im getting on borderlands and i will kill you ...
2,2401,Borderlands,3,im coming on borderlands and i will murder you...
3,2401,Borderlands,3,im getting on borderlands 2 and i will murder ...
4,2401,Borderlands,3,im getting into borderlands and i can murder y...


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler

X = train_data['Comment']
y = train_data['Sentiment']


X_tfidf = vectorizer.fit_transform(X)


scaler = StandardScaler(with_mean=False)
X_scaled = scaler.fit_transform(X_tfidf)

X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

model = LogisticRegression(max_iter=500, solver='saga')

# Train the model
model.fit(X_train, y_train)

# Predict on the test set
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

# Print the evaluation metrics
print(f'Accuracy: {accuracy}')

Accuracy: 0.8736401108182985




In [None]:
print('Classification Report:')
print(report)

Classification Report:
              precision    recall  f1-score   support

           0       0.86      0.83      0.84      2624
           1       0.89      0.88      0.89      4463
           2       0.88      0.87      0.88      3589
           3       0.84      0.89      0.86      4123

    accuracy                           0.87     14799
   macro avg       0.87      0.87      0.87     14799
weighted avg       0.87      0.87      0.87     14799

