In [1]:
import pandas as pd

In [7]:
df = pd.read_csv(r"C:\Users\User\Desktop\projects\Sentiment project\backend\train.tsv\train.tsv",  sep='\t')

In [6]:
df.head(10)

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,A series of escapades demonstrating the adage ...,1
1,2,1,A series of escapades demonstrating the adage ...,2
2,3,1,A series,2
3,4,1,A,2
4,5,1,series,2
5,6,1,of escapades demonstrating the adage that what...,2
6,7,1,of,2
7,8,1,escapades demonstrating the adage that what is...,2
8,9,1,escapades,2
9,10,1,demonstrating the adage that what is good for ...,2


In [9]:
import nltk
import pandas as pd
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
from sklearn.feature_extraction.text import CountVectorizer 

# Download NLTK data (if needed)
nltk.download('vader_lexicon')
nltk.download('punkt')
nltk.download('stopwords')


# Preprocessing
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    tokens = word_tokenize(text.lower())  # Tokenize and lowercase
    tokens = [t for t in tokens if t.isalpha()]  # Remove non-alphabetic tokens
    tokens = [t for t in tokens if t not in stop_words]  # Remove stopwords
    return ' '.join(tokens)

df['Processed_Phrase'] = df['Phrase'].apply(preprocess_text)

# Split the dataset into training and testing sets
X = df['Processed_Phrase']
y = df['Sentiment']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Text Vectorization (using a simple bag-of-words approach)
vectorizer = CountVectorizer()
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

# Model Building: Naive Bayes classifier
nb_classifier = MultinomialNB()
nb_classifier.fit(X_train_vectorized, y_train)

# Model Evaluation
y_pred = nb_classifier.predict(X_test_vectorized)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
print("Classification Report:\n", classification_report(y_test, y_pred))

# Predict sentiment for new input
def predict_sentiment(input_text):
    input_text = preprocess_text(input_text)
    input_vectorized = vectorizer.transform([input_text])
    predicted_sentiment = nb_classifier.predict(input_vectorized)[0]
    return predicted_sentiment

# Predict sentiment for new input phrases
new_inputs = ["This movie was fantastic!", "The acting was terrible."]
for input_text in new_inputs:
    predicted_sentiment = predict_sentiment(input_text)
    print(f"Sentence: '{input_text}', Predicted Sentiment: {predicted_sentiment}")


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Accuracy: 0.6098936306548763
Classification Report:
               precision    recall  f1-score   support

           0       0.47      0.27      0.35      1416
           1       0.51      0.40      0.45      5527
           2       0.68      0.80      0.73     15639
           3       0.52      0.49      0.51      6707
           4       0.49      0.29      0.36      1923

    accuracy                           0.61     31212
   macro avg       0.53      0.45      0.48     31212
weighted avg       0.59      0.61      0.59     31212

Sentence: 'This movie was fantastic!', Predicted Sentiment: 3
Sentence: 'The acting was terrible.', Predicted Sentiment: 0


In [8]:
import nltk
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Download NLTK data (if needed)
nltk.download('punkt')
nltk.download('stopwords')



# Map sentiment labels to 0-4 scale
sentiment_mapping = {
    0: 'negative',
    1: 'somewhat negative',
    2: 'neutral',
    3: 'somewhat positive',
    4: 'positive'
}

df['Sentiment'] = df['Sentiment'].map(sentiment_mapping)

# Preprocessing
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    tokens = word_tokenize(text.lower())  # Tokenize and lowercase
    tokens = [t for t in tokens if t.isalpha()]  # Remove non-alphabetic tokens
    tokens = [t for t in tokens if t not in stop_words]  # Remove stopwords
    return ' '.join(tokens)

df['Processed_Phrase'] = df['Phrase'].apply(preprocess_text)

# Split the dataset into training and testing sets
X = df['Processed_Phrase']
y = df['Sentiment']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Text Vectorization using TF-IDF
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Model Building: Logistic Regression as an example
model = LogisticRegression(max_iter=5000)  # Increase max_iter for convergence
model.fit(X_train_tfidf, y_train)

# Model Evaluation
y_pred = model.predict(X_test_tfidf)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
print("Classification Report:\n", classification_report(y_test, y_pred))

# Predict sentiment for new input
def predict_sentiment(input_text):
    input_text = preprocess_text(input_text)
    input_vectorized = vectorizer.transform([input_text])
    predicted_sentiment = model.predict(input_vectorized)[0]
    return predicted_sentiment

# Predict sentiment for new input phrases
new_inputs = ["This movie was fantastic!", "The acting was terrible.","The weather ruined our plans for the day","The presentation could have been better, but it wasn't the worst.","The conference started at 10 AM.","The event was enjoyable; I had a good time.","I loved the movie; it was absolutely fantastic!","The play was outstanding, and I would definitely watch it again.","I went for a walk in the park today.","The movie was of average length."
]
for input_text in new_inputs:
    predicted_sentiment = predict_sentiment(input_text)
    print(f"Sentence: '{input_text}', Predicted Sentiment: {predicted_sentiment}")


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Accuracy: 0.6271946687171601
Classification Report:
                    precision    recall  f1-score   support

         negative       0.60      0.18      0.28      1416
          neutral       0.66      0.88      0.76     15639
         positive       0.62      0.24      0.34      1923
somewhat negative       0.54      0.35      0.42      5527
somewhat positive       0.57      0.47      0.51      6707

         accuracy                           0.63     31212
        macro avg       0.60      0.42      0.46     31212
     weighted avg       0.61      0.63      0.60     31212

Sentence: 'This movie was fantastic!', Predicted Sentiment: positive
Sentence: 'The acting was terrible.', Predicted Sentiment: negative
Sentence: 'The weather ruined our plans for the day', Predicted Sentiment: neutral
Sentence: 'The presentation could have been better, but it wasn't the worst.', Predicted Sentiment: somewhat negative
Sentence: 'The conference started at 10 AM.', Predicted Sentiment: neutral


In [9]:
# Predict sentiment for new input phrases
new_inputs = ["This movie was fantastic!", "The acting was terrible.","The weather ruined our plans for the day","The presentation could have been better, but it wasn't the worst.","The conference started at 10 AM.","The event was enjoyable; I had a good time.","I loved the movie; it was absolutely fantastic!","The play was outstanding, and I would definitely watch it again.","I went for a walk in the park today.","The movie was of average length.","great","not great","this is awesome"
]
for input_text in new_inputs:
    predicted_sentiment = predict_sentiment(input_text)
    print(f"Sentence: '{input_text}', Predicted Sentiment: {predicted_sentiment}")

Sentence: 'This movie was fantastic!', Predicted Sentiment: positive
Sentence: 'The acting was terrible.', Predicted Sentiment: negative
Sentence: 'The weather ruined our plans for the day', Predicted Sentiment: neutral
Sentence: 'The presentation could have been better, but it wasn't the worst.', Predicted Sentiment: somewhat negative
Sentence: 'The conference started at 10 AM.', Predicted Sentiment: neutral
Sentence: 'The event was enjoyable; I had a good time.', Predicted Sentiment: somewhat positive
Sentence: 'I loved the movie; it was absolutely fantastic!', Predicted Sentiment: positive
Sentence: 'The play was outstanding, and I would definitely watch it again.', Predicted Sentiment: positive
Sentence: 'I went for a walk in the park today.', Predicted Sentiment: neutral
Sentence: 'The movie was of average length.', Predicted Sentiment: neutral
Sentence: 'great', Predicted Sentiment: somewhat positive
Sentence: 'not great', Predicted Sentiment: somewhat positive
Sentence: 'this is

In [13]:
# Predict sentiment for new input phrases
new_inputs = ["The movie was a complete disaster.","I hated the food at that restaurant.","The customer service was awful, very disappointing experience.","The traffic was unbearable, making me late for my appointment.","fuck you"]
for input_text in new_inputs:
    predicted_sentiment = predict_sentiment(input_text)
    print(f"Sentence: '{input_text}', Predicted Sentiment: {predicted_sentiment}")

Sentence: 'The movie was a complete disaster.', Predicted Sentiment: negative
Sentence: 'I hated the food at that restaurant.', Predicted Sentiment: neutral
Sentence: 'The customer service was awful, very disappointing experience.', Predicted Sentiment: negative
Sentence: 'The traffic was unbearable, making me late for my appointment.', Predicted Sentiment: neutral
Sentence: 'fuck you', Predicted Sentiment: neutral


In [14]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Load the dataset
df = pd.read_csv(r"C:\Users\User\Desktop\projects\Sentiment project\backend\train.tsv\train.tsv",  sep='\t')

# Map sentiment labels to 0-4 scale
sentiment_mapping = {
    0: 'negative',
    1: 'somewhat negative',
    2: 'neutral',
    3: 'somewhat positive',
    4: 'positive'
}

df['Sentiment'] = df['Sentiment'].map(sentiment_mapping)

# Preprocessing
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    tokens = word_tokenize(text.lower())  # Tokenize and lowercase
    tokens = [t for t in tokens if t.isalpha()]  # Remove non-alphabetic tokens
    tokens = [t for t in tokens if t not in stop_words]  # Remove stopwords
    return ' '.join(tokens)

df['Processed_Phrase'] = df['Phrase'].apply(preprocess_text)

# Split the dataset into training and testing sets
X = df['Processed_Phrase']
y = df['Sentiment']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Text Vectorization using TF-IDF with n-grams (1, 2)
vectorizer = TfidfVectorizer(ngram_range=(1, 2))
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Model Building: Random Forest Classifier with class weights
model = RandomForestClassifier(class_weight='balanced', random_state=42)
model.fit(X_train_tfidf, y_train)

# Model Evaluation
y_pred = model.predict(X_test_tfidf)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
print("Classification Report:\n", classification_report(y_test, y_pred))


: 

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# Load the dataset
df = pd.read_csv(r"C:\Users\User\Desktop\projects\Sentiment project\backend\train.tsv\train.tsv",  sep='\t')

# Map sentiment labels to 0-4 scale
sentiment_mapping = {
    0: 'negative',
    1: 'somewhat negative',
    2: 'neutral',
    3: 'somewhat positive',
    4: 'positive'
}

df['Sentiment'] = df['Sentiment'].map(sentiment_mapping)

# Preprocessing
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    tokens = word_tokenize(text.lower())  # Tokenize and lowercase
    tokens = [t for t in tokens if t.isalpha()]  # Remove non-alphabetic tokens
    tokens = [t for t in tokens if t not in stop_words]  # Remove stopwords
    return ' '.join(tokens)

df['Processed_Phrase'] = df['Phrase'].apply(preprocess_text)

# Encode the labels
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df['Sentiment'])

# Tokenize the text data
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df['Processed_Phrase'])
X = tokenizer.texts_to_sequences(df['Processed_Phrase'])
X = pad_sequences(X)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Model Building: LSTM
embedding_dim = 128
vocab_size = len(tokenizer.word_index) + 1
max_length = X.shape[1]

model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_length))
model.add(LSTM(units=100))
model.add(Dense(5, activation='softmax'))  # 5 classes for sentiment

model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

# Train the model
model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=5, batch_size=64)

# Model Evaluation
y_pred_prob = model.predict(X_test)
y_pred = np.argmax(y_pred_prob, axis=1)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
print("Classification Report:\n", classification_report(y_test, y_pred))


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 28, 128)           1861120   
                                                                 
 lstm (LSTM)                 (None, 100)               91600     
                                                                 
 dense (Dense)               (None, 5)                 505       
                                                                 
Total params: 1953225 (7.45 MB)
Trainable params: 1953225 (7.45 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Accuracy: 0.6469733863044128
Classification Report:
               precision    recall  f1-score   support

           0       0.50      0.42      0.45      2113
           1       0.72      0.80      0.76     23588
           2    

In [7]:
y_pred_prob = model.predict(X_test)
y_pred = np.argmax(y_pred_prob, axis=1)
accuracy = accuracy_score(y_test, y_pred)




In [3]:
print("Accuracy:", accuracy)
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.6469733863044128
Classification Report:
               precision    recall  f1-score   support

           0       0.50      0.42      0.45      2113
           1       0.72      0.80      0.76     23588
           2       0.50      0.50      0.50      2848
           3       0.54      0.47      0.51      8228
           4       0.58      0.51      0.54     10041

    accuracy                           0.65     46818
   macro avg       0.57      0.54      0.55     46818
weighted avg       0.64      0.65      0.64     46818



In [4]:
import pickle

# Save the model to disk
with open('lstm_model.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

In [5]:
# Save the tokenizer to a file
with open('tokenizer.pkl', 'wb') as f:
    pickle.dump(tokenizer, f)