In [4]:
import pandas as pd
import nltk
from nltk.tokenize import RegexpTokenizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics

In [8]:
# Read the tab-separated phrase/sentiment file into a DataFrame
data = pd.read_csv('data7.tsv', delimiter='\t')
# Leaving the frame as the last expression renders pandas' truncated preview
data

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,A series of escapades demonstrating the adage ...,1
1,2,1,A series of escapades demonstrating the adage ...,2
2,3,1,A series,2
3,4,1,A,2
4,5,1,series,2
...,...,...,...,...
156055,156056,8544,Hearst 's,2
156056,156057,8544,forced avuncular chortles,1
156057,156058,8544,avuncular chortles,3
156058,156059,8544,avuncular,2


In [9]:
# Peek at the first five rows to see what each column holds
data.head(n=5)

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,A series of escapades demonstrating the adage ...,1
1,2,1,A series of escapades demonstrating the adage ...,2
2,3,1,A series,2
3,4,1,A,2
4,5,1,series,2


In [10]:
# Summarize the frame: index range, per-column non-null counts and dtypes,
# and memory usage. Handy for spotting columns that contain missing values.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 156060 entries, 0 to 156059
Data columns (total 4 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   PhraseId    156060 non-null  int64 
 1   SentenceId  156060 non-null  int64 
 2   Phrase      156058 non-null  object
 3   Sentiment   156060 non-null  int64 
dtypes: int64(3), object(1)
memory usage: 4.8+ MB


In [11]:
# Class distribution: how many rows carry each Sentiment label,
# listed from the most frequent label to the least frequent one
data['Sentiment'].value_counts(sort=True, ascending=False)

Sentiment
2    79582
3    32927
1    27273
4     9206
0     7072
Name: count, dtype: int64

In [12]:
# Word tokenizer: every maximal run of ASCII letters and/or digits becomes a token,
# so punctuation and other symbols are dropped while numbers are kept.
tokenizer = RegexpTokenizer(pattern=r'[a-zA-Z0-9]+')

In [14]:

# Same tokenizer as in the previous cell, re-created with an identical pattern:
# runs of ASCII letters/digits are tokens; symbols are discarded, digits are kept.
# The CountVectorizer step further down uses it to split phrases.
tokenizer = RegexpTokenizer(pattern=r'[a-zA-Z0-9]+')

In [16]:
# Show which columns are available before working on 'Phrase'
print("Columns:", data.columns)

# Keep only the rows that have a Phrase, then force that column to str.
# dropna() returns a new frame and assign() builds the converted column on a copy,
# so `data` ends up bound to a fresh, independent DataFrame.
data = (
    data
    .dropna(subset=['Phrase'])
    .assign(Phrase=lambda frame: frame['Phrase'].astype(str))
)

# Sanity check: number of missing Phrase values left after the cleanup
print("Remaining NaN values:", data['Phrase'].isna().sum())

Columns: Index(['PhraseId', 'SentenceId', 'Phrase', 'Sentiment'], dtype='object')
Remaining NaN values: 0


In [17]:
# Bag-of-words features: turn every phrase into a sparse vector of token counts.
#   lowercase=True        - fold case before tokenizing
#   stop_words='english'  - drop scikit-learn's built-in English stop words
#   tokenizer=...         - split text with the RegexpTokenizer defined above
#   token_pattern=None    - the default token_pattern is ignored whenever a custom
#                           tokenizer is supplied; passing None states that explicitly
#                           and avoids the "token_pattern will not be used" UserWarning
# NOTE(review): the vocabulary is fit on every row of data['Phrase'], i.e. before the
# train/test split further down, so held-out phrases also contribute to the vocabulary.
cv = CountVectorizer(
    lowercase=True,
    stop_words='english',
    tokenizer=tokenizer.tokenize,
    token_pattern=None,
)
text_counts = cv.fit_transform(data['Phrase'])
print(text_counts)



<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 598944 stored elements and shape (156058, 14988)>
  Coords	Values
  (0, 11671)	1
  (0, 4517)	1
  (0, 3444)	1
  (0, 294)	1
  (0, 5735)	2
  (0, 5751)	1
  (0, 5512)	1
  (0, 9065)	1
  (0, 593)	1
  (0, 584)	1
  (0, 12673)	1
  (1, 11671)	1
  (1, 4517)	1
  (1, 3444)	1
  (1, 294)	1
  (1, 5735)	1
  (1, 5751)	1
  (2, 11671)	1
  (4, 11671)	1
  (5, 4517)	1
  (5, 3444)	1
  (5, 294)	1
  (5, 5735)	1
  (5, 5751)	1
  (7, 4517)	1
  :	:
  (156048, 11305)	1
  (156048, 9054)	1
  (156049, 11305)	1
  (156049, 9054)	1
  (156050, 11305)	1
  (156051, 11281)	1
  (156051, 1281)	1
  (156051, 5252)	1
  (156051, 6156)	1
  (156051, 1006)	1
  (156051, 2271)	1
  (156052, 11281)	1
  (156052, 5252)	1
  (156052, 6156)	1
  (156052, 1006)	1
  (156052, 2271)	1
  (156053, 11281)	1
  (156053, 6156)	1
  (156054, 5252)	1
  (156054, 1006)	1
  (156054, 2271)	1
  (156055, 1006)	1
  (156055, 2271)	1
  (156056, 1006)	1
  (156057, 2271)	1


In [18]:
# Hold out 30% of the rows for testing and train on the remaining 70%.
# random_state pins the shuffle so the same partition is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(
    text_counts,
    data['Sentiment'],
    test_size=0.3,
    random_state=1,
)

In [19]:
# Fit a multinomial Naive Bayes model on the training count vectors.
# fit() returns the estimator itself, so it can be bound to `clf` directly.
clf = MultinomialNB().fit(X_train, y_train)
# Show the fitted estimator and its parameters
clf

0,1,2
,alpha,1.0
,force_alpha,True
,fit_prior,True
,class_prior,


In [20]:
# Predict a sentiment label for every held-out phrase
predicted = clf.predict(X=X_test)

In [21]:
# Fraction of test phrases whose predicted label equals the true label
accuracy = metrics.accuracy_score(y_true=y_test, y_pred=predicted)
print("MultinomialNB Accuracy:", accuracy)

MultinomialNB Accuracy: 0.6077149814174035


In [22]:
# TF-IDF features: a vectorizer with default settings (no custom tokenizer or
# stop-word list is passed here) is fit on every phrase and used to build the
# weighted document-term matrix.
tfidf = TfidfVectorizer()
text_tfidf = tfidf.fit_transform(raw_documents=data['Phrase'])

In [23]:
# Print the TF-IDF matrix built in the previous cell. For a SciPy sparse matrix,
# print() lists the stored (row, column) coordinates together with their weights
# instead of rendering a dense table.
print(text_tfidf)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 972101 stored elements and shape (156058, 15240)>
  Coords	Values
  (0, 11837)	0.1761994815877001
  (0, 9227)	0.27061614383578647
  (0, 4577)	0.2785389411949561
  (0, 3490)	0.24850612693023294
  (0, 13505)	0.17689948283394402
  (0, 288)	0.25113431984806095
  (0, 13503)	0.08982495481054026
  (0, 14871)	0.1354415143200978
  (0, 7217)	0.17522895610107117
  (0, 5821)	0.26253021424806067
  (0, 5323)	0.20344749300073856
  (0, 5837)	0.22883824624854973
  (0, 529)	0.16143822063670707
  (0, 5595)	0.2657965179233486
  (0, 12424)	0.13815927559750085
  (0, 14888)	0.2870192587636
  (0, 9204)	0.19301342336433744
  (0, 602)	0.263419028234781
  (0, 1879)	0.11034429613283914
  (0, 9085)	0.18985163231762975
  (0, 593)	0.22068918609103486
  (0, 13681)	0.07615269515791664
  (0, 8807)	0.13538792727190468
  (0, 12857)	0.12785633223319684
  (1, 11837)	0.29125925804739433
  :	:
  (156048, 625)	0.21157194820303643
  (156048, 11465)	0.670263797515514