In [1]:
import pandas as pd

In [10]:
import os

# Location of the tab-separated training file. The default is the original
# machine-specific path; set the TRAIN_TSV_PATH environment variable to load
# the data from another location without editing this cell.
TRAIN_TSV_PATH = os.environ.get(
    "TRAIN_TSV_PATH",
    r"C:\Users\User\Desktop\projects\Sentiment project\backend\train.tsv\train.tsv",
)
df = pd.read_csv(TRAIN_TSV_PATH, sep='\t')

In [6]:
# Show the first ten rows of the loaded frame.
df.head(10)

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,A series of escapades demonstrating the adage ...,1
1,2,1,A series of escapades demonstrating the adage ...,2
2,3,1,A series,2
3,4,1,A,2
4,5,1,series,2
5,6,1,of escapades demonstrating the adage that what...,2
6,7,1,of,2
7,8,1,escapades demonstrating the adage that what is...,2
8,9,1,escapades,2
9,10,1,demonstrating the adage that what is good for ...,2


In [9]:
import nltk
import pandas as pd
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
from sklearn.feature_extraction.text import CountVectorizer 

# Download the NLTK resources used below; packages that are already present
# are simply reported as up-to-date.
#   - 'punkt' / 'punkt_tab': tokenizer data for word_tokenize. NLTK >= 3.8.2
#     looks up 'punkt_tab', older releases look up 'punkt', so both are fetched.
#   - 'stopwords': the English stop-word list.
#   - 'vader_lexicon': not used by any code visible in this cell; kept so the
#     download side effect is unchanged.
for nltk_resource in ('vader_lexicon', 'punkt', 'punkt_tab', 'stopwords'):
    nltk.download(nltk_resource)


# Preprocessing
# English stop words held in a set for fast membership tests in preprocess_text.
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    """Reduce a phrase to a space-joined string of lowercase content words.

    The text is lowercased and split with ``word_tokenize``. Tokens that are
    not purely alphabetic, and tokens present in the module-level
    ``stop_words`` set, are dropped.

    Args:
        text: The raw phrase. Anything that is not a ``str`` (for example
            ``None`` or the float NaN of a missing cell) is treated as empty.

    Returns:
        str: The surviving tokens joined by single spaces, or ``''`` when no
        token survives or the input is not a string.
    """
    if not isinstance(text, str):
        # Non-strings have no .lower(); return an empty document instead of
        # raising AttributeError in the middle of a DataFrame.apply.
        return ''
    return ' '.join(
        token
        for token in word_tokenize(text.lower())
        if token.isalpha() and token not in stop_words
    )

# Clean every phrase once and store the result next to the raw text.
df['Processed_Phrase'] = df['Phrase'].apply(preprocess_text)

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible.
# NOTE(review): rows are phrases and several rows can share a SentenceId, so a
# purely random row split may place fragments of one sentence on both sides.
# Consider a split grouped by SentenceId; TODO confirm.
X, y = df['Processed_Phrase'], df['Sentiment']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Bag-of-words features: the vocabulary is learned from the training split
# only and then reused unchanged for the test split.
vectorizer = CountVectorizer()
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

# Multinomial Naive Bayes on the raw token counts (fit() returns the estimator).
nb_classifier = MultinomialNB().fit(X_train_vectorized, y_train)

# Score the held-out rows and print the overall and per-class metrics.
y_pred = nb_classifier.predict(X_test_vectorized)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
print("Classification Report:\n", classification_report(y_test, y_pred))

# Predict sentiment for new input
def predict_sentiment(input_text):
    """Predict the sentiment label of one raw sentence with the Naive Bayes model.

    The sentence is cleaned by ``preprocess_text``, converted to a one-row
    feature matrix by the module-level ``vectorizer``, and classified by the
    module-level ``nb_classifier``.

    Args:
        input_text: The raw, unprocessed sentence.

    Returns:
        The predicted label, taken as the first element of the single-item
        prediction array.
    """
    cleaned = preprocess_text(input_text)
    features = vectorizer.transform([cleaned])
    return nb_classifier.predict(features)[0]

# Try the trained classifier on two hand-written sentences and print the
# predicted label for each one.
new_inputs = ["This movie was fantastic!", "The acting was terrible."]
for input_text in new_inputs:
    # predict_sentiment preprocesses and vectorizes the sentence before classifying it.
    predicted_sentiment = predict_sentiment(input_text)
    print(f"Sentence: '{input_text}', Predicted Sentiment: {predicted_sentiment}")


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Accuracy: 0.6098936306548763
Classification Report:
               precision    recall  f1-score   support

           0       0.47      0.27      0.35      1416
           1       0.51      0.40      0.45      5527
           2       0.68      0.80      0.73     15639
           3       0.52      0.49      0.51      6707
           4       0.49      0.29      0.36      1923

    accuracy                           0.61     31212
   macro avg       0.53      0.45      0.48     31212
weighted avg       0.59      0.61      0.59     31212

Sentence: 'This movie was fantastic!', Predicted Sentiment: 3
Sentence: 'The acting was terrible.', Predicted Sentiment: 0


In [11]:
import nltk
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Download the NLTK resources needed for tokenizing and stop-word removal;
# packages that are already present are simply reported as up-to-date.
# word_tokenize looks up 'punkt_tab' on NLTK >= 3.8.2 and 'punkt' on older
# releases, so both are requested.
for nltk_resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(nltk_resource)



# Human-readable names for the integer sentiment classes 0-4.
sentiment_mapping = {
    0: 'negative',
    1: 'somewhat negative',
    2: 'neutral',
    3: 'somewhat positive',
    4: 'positive'
}

# Replace each integer code with its name. Values that are not keys of
# sentiment_mapping (for example names written by an earlier run of this cell)
# pass through unchanged, so re-running the cell does not turn the whole
# column into NaN the way a plain Series.map(dict) would.
df['Sentiment'] = df['Sentiment'].map(
    lambda label: sentiment_mapping.get(label, label)
)

# Preprocessing
# English stop words collected into a set for membership tests in preprocess_text.
stop_words = {*stopwords.words('english')}

def preprocess_text(text):
    """Turn a phrase into a space-joined string of lowercase content words.

    The text is lowercased and tokenized with ``word_tokenize``; only tokens
    that are purely alphabetic and absent from the module-level ``stop_words``
    set are kept.

    Args:
        text: The raw phrase. Values that are not ``str`` (such as ``None`` or
            a float NaN from a missing cell) are treated as empty text.

    Returns:
        str: The kept tokens joined by single spaces, or ``''`` when nothing
        is kept or the input is not a string.
    """
    # Guard first: calling .lower() on a non-string would raise AttributeError.
    if not isinstance(text, str):
        return ''

    kept_tokens = []
    for token in word_tokenize(text.lower()):
        if token.isalpha() and token not in stop_words:
            kept_tokens.append(token)
    return ' '.join(kept_tokens)

# Clean every phrase once and store the result next to the raw text.
df['Processed_Phrase'] = df['Phrase'].apply(preprocess_text)

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible.
X, y = df['Processed_Phrase'], df['Sentiment']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# TF-IDF features: the vocabulary and IDF weights are learned from the
# training split only and then reused unchanged for the test split.
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Logistic regression as an example model; max_iter is raised so the solver
# has room to converge (fit() returns the estimator).
model = LogisticRegression(max_iter=5000).fit(X_train_tfidf, y_train)

# Score the held-out rows and print the overall and per-class metrics.
y_pred = model.predict(X_test_tfidf)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
print("Classification Report:\n", classification_report(y_test, y_pred))

# Predict sentiment for new input
def predict_sentiment(input_text):
    """Predict the sentiment label of one raw sentence with the fitted ``model``.

    The sentence is cleaned by ``preprocess_text``, converted to a one-row
    feature matrix by the module-level ``vectorizer``, and classified by the
    module-level ``model``.

    Args:
        input_text: The raw, unprocessed sentence.

    Returns:
        The predicted label, taken as the first element of the single-item
        prediction array.
    """
    prepared = [preprocess_text(input_text)]
    prediction = model.predict(vectorizer.transform(prepared))
    return prediction[0]

# Try the trained model on two hand-written sentences and print the predicted
# label for each one.
new_inputs = ["This movie was fantastic!", "The acting was terrible."]
for input_text in new_inputs:
    # predict_sentiment preprocesses and vectorizes the sentence before classifying it.
    predicted_sentiment = predict_sentiment(input_text)
    print(f"Sentence: '{input_text}', Predicted Sentiment: {predicted_sentiment}")


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
