In [1]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

#if the cell errors
#please specify the path for the data to match the current path on your computer.

data = pd.read_csv(r'./Data/twitter_training.csv', encoding='ISO-8859-1', header=None)
data.columns=['TweetID','entity','sentiment','message']

print(data.head())
messages=data["message"]
messages = messages.fillna('')
messages = messages.astype(str)


   TweetID       entity sentiment  \
0     2401  Borderlands  Positive   
1     2401  Borderlands  Positive   
2     2401  Borderlands  Positive   
3     2401  Borderlands  Positive   
4     2401  Borderlands  Positive   

                                             message  
0  im getting on borderlands and i will murder yo...  
1  I am coming to the borders and I will kill you...  
2  im getting on borderlands and i will kill you ...  
3  im coming on borderlands and i will murder you...  
4  im getting on borderlands 2 and i will murder ...  


In [2]:
# print(data.info)
# print(data['sentiment'].value_counts())
# print(messages.isnull().sum())
# print(messages.dtype)
data['message_length'] = messages.apply(len) 
print(data.head())
print(messages.describe())

   TweetID       entity sentiment  \
0     2401  Borderlands  Positive   
1     2401  Borderlands  Positive   
2     2401  Borderlands  Positive   
3     2401  Borderlands  Positive   
4     2401  Borderlands  Positive   

                                             message  message_length  
0  im getting on borderlands and i will murder yo...              53  
1  I am coming to the borders and I will kill you...              51  
2  im getting on borderlands and i will kill you ...              50  
3  im coming on borderlands and i will murder you...              51  
4  im getting on borderlands 2 and i will murder ...              57  
count     74682
unique    69492
top            
freq        686
Name: message, dtype: object


In [None]:
import re
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer

# Initialize the lemmatizer and stop words
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocess_tweet(tweet):
    # Remove URLs, mentions, hashtags, and special characters
    tweet = re.sub(r'http\S+|www\S+|@\S+|#\S+|[^A-Za-z0-9\s]+', '', tweet)
    # Convert to lowercase
    tweet = tweet.lower()
    # Tokenize and remove stopwords
    words = [word for word in tweet.split() if word not in stop_words]
    # Lemmatize the words
    words = [lemmatizer.lemmatize(word) for word in words]
    return ' '.join(words)

data['cleaned_tweet'] = messages.apply(preprocess_tweet)
print(data['cleaned_tweet'].head())


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\arman\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\arman\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


0      im getting borderland murder
1                coming border kill
2        im getting borderland kill
3       im coming borderland murder
4    im getting borderland 2 murder
Name: cleaned_tweet, dtype: object


In [7]:
validation_data = pd.read_csv(r'./Data/twitter_validation.csv', encoding='ISO-8859-1', header = None)
validation_data.columns= ['TweetID','entity','sentiment','message']

#validation_data.head()
validation_data['cleaned_tweet'] = validation_data['message'].apply(preprocess_tweet)


In [None]:
""" 
this is the part that we build a model and convert the texts into features.
"""
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(max_features = 5000)
X_train = vectorizer.fit_transform(data['cleaned_tweet'])
X_val = vectorizer.transform(validation_data['cleaned_tweet'])
print(f'Training data shape: {X_train.shape}')
print(f'Validation data shape: {X_val.shape}')

Training data shape: (74682, 5000)
Validation data shape: (1000, 5000)


In [None]:

model = LogisticRegression(max_iter = 1000)
model.fit(X_train, data['sentiment'])
y_pred = model.predict(X_val)


In [None]:
accuracy = accuracy_score(validation_data['sentiment'], y_pred)
print(f'Validation Accuracy: {accuracy * 100:.2f}%')
print(classification_report(validation_data['sentiment'], y_pred, target_names=['Negative', 'Neutral', 'Positive', 'Irrelevant']))


Validation Accuracy: 80.00%
              precision    recall  f1-score   support

    Negative       0.78      0.70      0.74       172
     Neutral       0.77      0.88      0.82       266
    Positive       0.86      0.74      0.79       285
  Irrelevant       0.79      0.84      0.82       277

    accuracy                           0.80      1000
   macro avg       0.80      0.79      0.79      1000
weighted avg       0.80      0.80      0.80      1000



In [11]:
"now I wanna save this model and take an input from the user in which then tries to categorizes the given review"
import joblib
joblib.dump(model, 'logistic_regression_model.pkl')
joblib.dump(vectorizer, 'tfidf_vectorizer.pkl')

['tfidf_vectorizer.pkl']

In [12]:
model = joblib.load('logistic_regression_model.pkl')
vectorizer = joblib.load('tfidf_vectorizer.pkl')
def classify_sentence(sentence):
    sentence_cleaned = preprocess_tweet(sentence)  
    sentence_vectorized = vectorizer.transform([sentence_cleaned])
    prediction = model.predict(sentence_vectorized)

    label_mapping = {0: 'Negative', 1: 'Neutral', 2: 'Positive', 3: 'Irrelevant'}
    return label_mapping[prediction[0]]

In [1]:
import ipywidgets as widgets
from IPython.display import display, clear_output
input_box = widgets.Text(
    placeholder='Enter a sentence...',
    description='Input:',
    disabled=False
)
submit_button = widgets.Button(
    description="Classify",
    button_style='primary'
)
output = widgets.Output()

def classify_sentence(sentence):
    sentence_cleaned = preprocess_tweet(sentence)
    sentence_vectorized = vectorizer.transform([sentence_cleaned])
    prediction = model.predict(sentence_vectorized)

    label_mapping = {0: 'Negative', 1: 'Neutral', 2: 'Positive', 3: 'Irrelevant'}
    if prediction[0] in label_mapping:
        return label_mapping[prediction[0]]
    else:
        return f"{prediction[0]}"

def on_submit(change):
    with output:
        clear_output(wait=True) 
        user_input = input_box.value.strip()  
        if not user_input:
            print("Please enter a valid sentence!")
            return
        if user_input.lower() == 'exit':
            print("Goodbye!")
            return
        try:
            sentiment = classify_sentence(user_input)
            print(f"Predicted Sentiment: {sentiment}")
        except KeyError as e:
            print(f"Error: {e}. Ensure your model outputs match the label mapping.")

submit_button.on_click(on_submit)

display(input_box, submit_button, output)


Text(value='', description='Input:', placeholder='Enter a sentence...')

Button(button_style='primary', description='Classify', style=ButtonStyle())

Output()