In [1]:
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, classification_report

In [2]:
# storing the file path in a variable
file_path = "foods.txt"

# Reading the content of the files using readlines
# readlines reads each line in a iteration
# Storing all the data in the variable file_data
with open(file_path, 'r') as file:
    file_data = file.readlines()

In [3]:
# Creating an empty lists of each column of the our dataframe
productIds, userIds, profileNames, helpfulness, scores, times, summaries, texts = ([] for _ in range(8))

# Going through each line of the data and appending them into different columns (separating the data column wise)
for line in file_data:
    if line.startswith('product/productId:'):
        productIds.append(line.split(': ')[1].strip())
    elif line.startswith('review/userId:'):
        userIds.append(line.split(': ')[1].strip())
    elif line.startswith('review/profileName:'):
        profileNames.append(line.split(': ')[1].strip())
    elif line.startswith('review/helpfulness:'):
        helpfulness.append(line.split(': ')[1].strip())
    elif line.startswith('review/score:'):
        scores.append(line.split(': ')[1].strip())
    elif line.startswith('review/time:'):
        times.append(line.split(': ')[1].strip())
    elif line.startswith('review/summary:'):
        summaries.append(line.split(': ')[1].strip())
    elif line.startswith('review/text:'):
        texts.append(line.split(': ')[1].strip())

# checking if the data split correctly into different columns
print("Sameple data of productIds: ", productIds[:5])
print("Sameple data of userIds: ", userIds[:5])
print("Sameple data of profileNames: ", profileNames[:5])
print("Sameple data of helpfulness: ", helpfulness[:5])
print("Sameple data of scores: ", scores[:5])
print("Sameple data of times: ", times[:5])
print("Sameple data of summaries: ", summaries[:5])
print("Sameple data of texts: ", texts[:2])

Sameple data of productIds:  ['B001E4KFG0', 'B00813GRG4', 'B000LQOCH0', 'B000UA0QIQ', 'B006K2ZZ7K']
Sameple data of userIds:  ['A3SGXH7AUHU8GW', 'A1D87F6ZCVE5NK', 'ABXLMWJIXXAIN', 'A395BORC6FGVXV', 'A1UQRSCLF8GW1T']
Sameple data of profileNames:  ['delmartian', 'dll pa', 'Natalia Corres "Natalia Corres"', 'Karl', 'Michael D. Bigham "M. Wassir"']
Sameple data of helpfulness:  ['1/1', '0/0', '1/1', '3/3', '0/0']
Sameple data of scores:  ['5.0', '1.0', '4.0', '2.0', '5.0']
Sameple data of times:  ['1303862400', '1346976000', '1219017600', '1307923200', '1350777600']
Sameple data of summaries:  ['Good Quality Dog Food', 'Not as Advertised', '"Delight" says it all', 'Cough Medicine', 'Great taffy']
Sameple data of texts:  ['I have bought several of the Vitality canned dog food products and have found them all to be of good quality. The product looks more like a stew than a processed meat and it smells better. My Labrador is finicky and she appreciates this product better than  most.', 'Prod

In [5]:
# Creating a data frame with the above column data 
foods_df = pd.DataFrame({
    'product/productId': productIds,
    'review/userId': userIds,
    'review/profileName': profileNames,
    'review/helpfulness': helpfulness,
    'review/score': scores,
    'review/time': times,
    'review/summary': summaries,
    'review/text': texts
})
foods_df

Unnamed: 0,product/productId,review/userId,review/profileName,review/helpfulness,review/score,review/time,review/summary,review/text
0,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1/1,5.0,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0/0,1.0,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1/1,4.0,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,B000UA0QIQ,A395BORC6FGVXV,Karl,3/3,2.0,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0/0,5.0,1350777600,Great taffy,Great taffy at a great price. There was a wid...
...,...,...,...,...,...,...,...,...
568449,B001EO7N10,A28KG5XORO54AY,Lettie D. Carter,0/0,5.0,1299628800,Will not do without,Great for sesame chicken..this is a good if no...
568450,B003S1WTCU,A3I8AFVPEE8KI5,R. Sawyer,0/0,2.0,1331251200,disappointed,I'm disappointed with the flavor. The chocolat...
568451,B004I613EE,A121AA1GQV751Z,"pksd ""pk_007""",2/2,5.0,1329782400,Perfect for our maltipoo,"These stars are small, so you can give 10-15 o..."
568452,B004I613EE,A3IBEVCTXKNOH,"Kathy A. Welch ""katwel""",1/1,5.0,1331596800,Favorite Training and reward treat,These are the BEST treats for training and rew...


In [6]:
# Check for missing data
foods_df.isnull().sum()

product/productId     0
review/userId         0
review/profileName    0
review/helpfulness    0
review/score          0
review/time           0
review/summary        0
review/text           0
dtype: int64

In [7]:
# Using Naive Bayes to construct a classifier
# Collecting all the features 
features = ['review/userId', 'review/profileName', 'review/time', 'review/summary', 'review/text']
X = foods_df[features]
y = foods_df['review/score']

# Splitting the data into training and testing data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=0)

# Text parts of the X data
# X_text = foods_df['review/summary'] + ' ' + foods_df['review/text']

In [29]:
# Vectorizing the data
# Instantiate CountVectorizer (vectorizer)
# Bags of words
vectorizer = CountVectorizer()

# Fit the data
# learning the 'vocabulary'of the data
a = vectorizer.fit(X_train['review/summary'] + ' ' + X_train['review/text'])
vectorizer.fit(X_test['review/summary'] + ' ' + X_test['review/text'])

# check the fitted data (vocabulary)
vectorizer.get_feature_names_out()

# Tranforming the data into a mathematical matrix that tells us the frequency of terms (document-term matrix)
X_train_text = vectorizer.transform(X_train)
X_test_text = vectorizer.transform(X_test)

In [4]:
# Or do both at once
# Instantiate CountVectorizer (vectorizer)
vectorizer = CountVectorizer()

X_train_text = vectorizer.fit_transform(X_train['review/summary'] + ' ' + X_train['review/text'])
# we use the same vectorizer to transform test data. This ensures that the test data is represented in the same numerical format as training data
# vectorizer uses the vocabularly learnt during the  training 
X_test_text = vectorizer.transform(X_test['review/summary'] + ' ' + X_test['review/text'])

NameError: name 'X_train' is not defined

In [10]:
print(type(X_train_text))

<class 'scipy.sparse._csr.csr_matrix'>


In [11]:
# creating a naive bayes classifier
classifier = MultinomialNB(force_alpha=True)

# fit the data to the classifier
classifier.fit(X_train_text, y_train)

In [20]:
# Evaluate the model
y_prediction = classifier.predict(X_test_text)

# check the accuracy
accuracy = accuracy_score(y_test, y_prediction)
print("Accuracy", accuracy)

Accuracy 0.7174195273496184
