In [None]:
#SVM
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, mean_squared_error
from textblob import TextBlob

# Load the data
df2 = pd.read_csv('/content/merged_data.csv')

# Handle missing values by imputing with empty strings.
# The result is assigned back rather than using fillna(inplace=True) on the
# column selection: pandas warns about that chained in-place pattern and, with
# copy-on-write enabled, it no longer updates the frame.
df2['Cleaned_Review'] = df2['Cleaned_Review'].fillna('')

# Split the data into training and testing sets
X = df2['Cleaned_Review']  # Text data
y_sentiment = df2['Sentiment']  # Positive or Negative labels
# Ground-truth ratings; captured here because df2['rating'] is replaced with
# the predicted rating further down.
y_rating = df2['rating']

X_train, X_test, y_sentiment_train, y_sentiment_test, y_rating_train, y_rating_test = train_test_split(
    X, y_sentiment, y_rating, test_size=0.2, random_state=42)

# Create TF-IDF vectors from the text data (vocabulary is fit on the training split only)
tfidf_vectorizer = TfidfVectorizer(max_features=1000)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Train a sentiment analysis model using SVM
sentiment_classifier = SVC(kernel='linear', C=1.0)
sentiment_classifier.fit(X_train_tfidf, y_sentiment_train)

# Predict sentiment on the test set
y_sentiment_pred = sentiment_classifier.predict(X_test_tfidf)

# Evaluate the sentiment analysis model
sentiment_accuracy = accuracy_score(y_sentiment_test, y_sentiment_pred)

# Predict sentiment for every review so the combined score below is built from
# the model's output. Using the ground-truth labels here (as before) leaks the
# target and makes the rating MSE independent of the classifier.
X_all_tfidf = tfidf_vectorizer.transform(X)
y_sentiment_pred_all = pd.Series(sentiment_classifier.predict(X_all_tfidf), index=df2.index)

# Calculate sentiment scores using TextBlob for the entire dataset
df2['textblob_sentiment'] = df2['Cleaned_Review'].apply(lambda x: TextBlob(x).sentiment.polarity)

# Normalize TextBlob sentiment scores from [-1, 1] to the [0, 1] scale
df2['normalized_sentiment'] = (df2['textblob_sentiment'] + 1) / 2

# Combine the model's predicted sentiment with TextBlob sentiment scores.
# NOTE(review): assumes the label is spelled exactly 'positive' — confirm casing in the data.
df2['combined_sentiment'] = (df2['normalized_sentiment'] + (y_sentiment_pred_all == 'positive').astype(int)) / 2

# Calculate the overall rating score on a scale of 5.
# This overwrites the loaded 'rating' column; the original values live on in y_rating.
df2['rating'] = 5 * df2['combined_sentiment']

# Calculate the Mean Squared Error (MSE) for the ratings on the held-out rows only
rating_mse = mean_squared_error(y_rating_test, df2.loc[y_rating_test.index, 'rating'])

# Print results
print(f'Sentiment Analysis Accuracy: {sentiment_accuracy}')
print(f'Mean Squared Error for Ratings: {rating_mse}')


Sentiment Analysis Accuracy: 0.8707008016550297
Mean Squared Error for Ratings: 5.034545381435629


In [None]:
# Multinomial NB
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, mean_squared_error
from textblob import TextBlob

# Load the data
df = pd.read_csv('/content/merged_data.csv')

# Handle missing values by imputing with empty strings.
# The result is assigned back rather than using fillna(inplace=True) on the
# column selection: pandas warns about that chained in-place pattern and, with
# copy-on-write enabled, it no longer updates the frame.
df['Cleaned_Review'] = df['Cleaned_Review'].fillna('')

# Split the data into training and testing sets
X = df['Cleaned_Review']  # Text data
y_sentiment = df['Sentiment']  # Positive or Negative labels
# Ground-truth ratings; captured here because df['rating'] is replaced with
# the predicted rating further down.
y_rating = df['rating']

X_train, X_test, y_sentiment_train, y_sentiment_test, y_rating_train, y_rating_test = train_test_split(X, y_sentiment, y_rating, test_size=0.2, random_state=42)

# Create TF-IDF vectors from the text data (vocabulary is fit on the training split only)
tfidf_vectorizer = TfidfVectorizer(max_features=1000)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Train a sentiment analysis model (Multinomial Naive Bayes)
sentiment_classifier = MultinomialNB()
sentiment_classifier.fit(X_train_tfidf, y_sentiment_train)

# Predict sentiment on the test set
y_sentiment_pred = sentiment_classifier.predict(X_test_tfidf)

# Evaluate the sentiment analysis model
sentiment_accuracy = accuracy_score(y_sentiment_test, y_sentiment_pred)

# Predict sentiment for every review so the combined score below is built from
# the model's output. Using the ground-truth labels here (as before) leaks the
# target and makes the rating MSE independent of the classifier.
X_all_tfidf = tfidf_vectorizer.transform(X)
y_sentiment_pred_all = pd.Series(sentiment_classifier.predict(X_all_tfidf), index=df.index)

# Calculate sentiment scores using TextBlob for the entire dataset
df['textblob_sentiment'] = df['Cleaned_Review'].apply(lambda x: TextBlob(x).sentiment.polarity)

# Normalize TextBlob sentiment scores from [-1, 1] to the [0, 1] scale
df['normalized_sentiment'] = (df['textblob_sentiment'] + 1) / 2

# Combine the model's predicted sentiment with TextBlob sentiment scores.
# NOTE(review): assumes the label is spelled exactly 'positive' — confirm casing in the data.
df['combined_sentiment'] = (df['normalized_sentiment'] + (y_sentiment_pred_all == 'positive').astype(int)) / 2

# Calculate the overall rating score on a scale of 5.
# This overwrites the loaded 'rating' column; the original values live on in y_rating.
df['rating'] = 5 * df['combined_sentiment']

# Calculate the Mean Squared Error (MSE) for the ratings on the held-out rows.
# Reads this cell's own frame (df); the previous version read df2, i.e. the
# frame produced by a different cell.
rating_mse = mean_squared_error(y_rating_test, df.loc[y_rating_test.index, 'rating'])

# Print results
print(f'Sentiment Analysis Accuracy: {sentiment_accuracy}')
print(f'Mean Squared Error for Ratings: {rating_mse}')

Sentiment Analysis Accuracy: 0.8580294802172227
Mean Squared Error for Ratings: 5.034545381435629


In [None]:
#logistic

from sklearn.linear_model import LogisticRegression

df1 = pd.read_csv('/content/merged_data.csv')
# df1 is expected to provide the 'Cleaned_Review', 'Sentiment', and 'rating' columns.
# Missing reviews become empty strings. The result is assigned back rather than
# using fillna(inplace=True) on the column selection: pandas warns about that
# chained in-place pattern and, with copy-on-write enabled, it no longer
# updates the frame.
df1['Cleaned_Review'] = df1['Cleaned_Review'].fillna('')

# Split the data into training and testing sets
X = df1['Cleaned_Review']  # Text data
y_sentiment = df1['Sentiment']  # Positive or Negative labels
# Ground-truth ratings; captured here because df1['rating'] is replaced with
# the predicted rating further down.
y_rating = df1['rating']

X_train, X_test, y_sentiment_train, y_sentiment_test, y_rating_train, y_rating_test = train_test_split(
    X, y_sentiment, y_rating, test_size=0.2, random_state=42
)

# Create TF-IDF vectors from the text data (vocabulary is fit on the training split only)
tfidf_vectorizer = TfidfVectorizer(max_features=1000)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Train a logistic regression model
logistic_classifier = LogisticRegression()
logistic_classifier.fit(X_train_tfidf, y_sentiment_train)

# Predict sentiment on the test set
y_sentiment_pred = logistic_classifier.predict(X_test_tfidf)

# Evaluate the sentiment analysis model
sentiment_accuracy = accuracy_score(y_sentiment_test, y_sentiment_pred)

# Predict sentiment for every review so the combined score below is built from
# the model's output. Using the ground-truth labels here (as before) leaks the
# target and makes the rating MSE independent of the classifier.
X_all_tfidf = tfidf_vectorizer.transform(X)
y_sentiment_pred_all = pd.Series(logistic_classifier.predict(X_all_tfidf), index=df1.index)

# Calculate sentiment scores using TextBlob for the entire dataset
df1['textblob_sentiment'] = df1['Cleaned_Review'].apply(lambda x: TextBlob(x).sentiment.polarity)

# Normalize TextBlob sentiment scores from [-1, 1] to the [0, 1] scale
df1['normalized_sentiment'] = (df1['textblob_sentiment'] + 1) / 2

# Combine the model's predicted sentiment with TextBlob sentiment scores.
# NOTE(review): assumes the label is spelled exactly 'positive' — confirm casing in the data.
df1['combined_sentiment'] = (df1['normalized_sentiment'] + (y_sentiment_pred_all == 'positive').astype(int)) / 2

# Calculate the overall rating score on a scale of 5.
# This overwrites the loaded 'rating' column; the original values live on in y_rating.
df1['rating'] = 5 * df1['combined_sentiment']

# Calculate the Mean Squared Error (MSE) for the ratings on the held-out rows only
rating_mse = mean_squared_error(y_rating_test, df1.loc[y_rating_test.index, 'rating'])

# Print results
print(f'Sentiment Analysis Accuracy: {sentiment_accuracy}')
print(f'Mean Squared Error for Ratings: {rating_mse}')


Sentiment Analysis Accuracy: 0.873545384018619
Mean Squared Error for Ratings: 5.034545381435629


In [None]:

from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

df3 = pd.read_csv('/content/merged_data.csv')
# df3 is expected to provide the 'Cleaned_Review', 'Sentiment', and 'rating' columns.
# Missing reviews become empty strings. The result is assigned back rather than
# using fillna(inplace=True) on the column selection: pandas warns about that
# chained in-place pattern and, with copy-on-write enabled, it no longer
# updates the frame.
df3['Cleaned_Review'] = df3['Cleaned_Review'].fillna('')

# Split the data into training and testing sets
X = df3['Cleaned_Review']  # Text data
y_sentiment = df3['Sentiment']  # Positive or Negative labels
y_rating = df3['rating']  # Numeric ratings (ground truth)

X_train, X_test, y_sentiment_train, y_sentiment_test, y_rating_train, y_rating_test = train_test_split(
    X, y_sentiment, y_rating, test_size=0.2, random_state=42
)

# Create TF-IDF vectors from the text data (vocabulary is fit on the training split only)
tfidf_vectorizer = TfidfVectorizer(max_features=1000)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# TF-IDF features for every review; both models below use them to predict the
# sentiment that feeds the combined rating score.
X_all_tfidf = tfidf_vectorizer.transform(X)

# Calculate sentiment scores using TextBlob for the entire dataset
df3['textblob_sentiment'] = df3['Cleaned_Review'].apply(lambda x: TextBlob(x).sentiment.polarity)

# Normalize TextBlob sentiment scores from [-1, 1] to the [0, 1] scale
df3['normalized_sentiment'] = (df3['textblob_sentiment'] + 1) / 2

# Train a Random Forest model
random_forest_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
random_forest_classifier.fit(X_train_tfidf, y_sentiment_train)

# Predict sentiment on the test set
y_sentiment_pred_rf = random_forest_classifier.predict(X_test_tfidf)

# Evaluate the sentiment analysis model
sentiment_accuracy_rf = accuracy_score(y_sentiment_test, y_sentiment_pred_rf)

# Combine the Random Forest's predicted sentiment with TextBlob sentiment scores.
# The model's predictions are used instead of the ground-truth labels, which
# previously leaked the target and made the MSE identical for every model.
# NOTE(review): assumes the label is spelled exactly 'positive' — confirm casing in the data.
y_sentiment_pred_all_rf = pd.Series(random_forest_classifier.predict(X_all_tfidf), index=df3.index)
df3['combined_sentiment_rf'] = (df3['normalized_sentiment'] + (y_sentiment_pred_all_rf == 'positive').astype(int)) / 2

# Calculate the overall rating score on a scale of 5
df3['rating_rf'] = 5 * df3['combined_sentiment_rf']

# Calculate the Mean Squared Error (MSE) for the ratings on the held-out rows only
rating_mse_rf = mean_squared_error(y_rating_test, df3.loc[y_rating_test.index, 'rating_rf'])

# Print results for Random Forest
print(f'Random Forest Sentiment Analysis Accuracy: {sentiment_accuracy_rf}')
print(f'Random Forest Mean Squared Error for Ratings: {rating_mse_rf}')

# Train a Gradient Boosting model
gradient_boosting_classifier = GradientBoostingClassifier(n_estimators=100, random_state=42)
gradient_boosting_classifier.fit(X_train_tfidf, y_sentiment_train)

# Predict sentiment on the test set
y_sentiment_pred_gb = gradient_boosting_classifier.predict(X_test_tfidf)

# Evaluate the sentiment analysis model
sentiment_accuracy_gb = accuracy_score(y_sentiment_test, y_sentiment_pred_gb)

# Combine the Gradient Boosting model's predicted sentiment with TextBlob
# sentiment scores (again using predictions, not ground-truth labels).
y_sentiment_pred_all_gb = pd.Series(gradient_boosting_classifier.predict(X_all_tfidf), index=df3.index)
df3['combined_sentiment_gb'] = (df3['normalized_sentiment'] + (y_sentiment_pred_all_gb == 'positive').astype(int)) / 2

# Calculate the overall rating score on a scale of 5
df3['rating_gb'] = 5 * df3['combined_sentiment_gb']

# Calculate the Mean Squared Error (MSE) for the ratings on the held-out rows only
rating_mse_gb = mean_squared_error(y_rating_test, df3.loc[y_rating_test.index, 'rating_gb'])

# Print results for Gradient Boosting
print(f'Gradient Boosting Sentiment Analysis Accuracy: {sentiment_accuracy_gb}')
print(f'Gradient Boosting Mean Squared Error for Ratings: {rating_mse_gb}')


Random Forest Sentiment Analysis Accuracy: 0.8647530385311611
Random Forest Mean Squared Error for Ratings: 5.034545381435629
Gradient Boosting Sentiment Analysis Accuracy: 0.852081717093354
Gradient Boosting Mean Squared Error for Ratings: 5.034545381435629
