### Machine Learning Group Workshop
#### Topic: Sentiment Analysis

#### What libraries are we using?

In [52]:
import pandas as pd  # Handle and manipulate tabular data (e.g., reading datasets, organizing data)
import numpy as np  # Support for numerical operations, array manipulations, and handling missing values

# Scikit-learn modules for feature extraction, model building, and evaluation
from sklearn.feature_extraction.text import TfidfVectorizer  # Convert text data into numerical vectors (TF-IDF)
from sklearn.linear_model import LogisticRegression  # Build a classification model (Logistic Regression)
from sklearn.model_selection import train_test_split  # Split data into training and testing sets
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix  # Evaluate the model and display metrics

##### Step One: Load our data

In [32]:
# Load the dataset
data = pd.read_csv('path_to_dataset.csv')
data.head()

##### Step Two: Define our targets and features

In [35]:
# Define features and target variables
X = data['text_column']  # Replace with the name of the text column in your dataset
y = data['sentiment_column']  # Replace with the name of the sentiment column in your dataset

##### Step Three: Preprocess text data

In [38]:
# Initialize TF-IDF Vectorizer
vectorizer = TfidfVectorizer()
X_tfidf = vectorizer.fit_transform(X)

##### Step Four: Build and train the model

In [41]:
# Initialize the Logistic Regression model
model = LogisticRegression(max_iter=200)

# Train the model using the training data
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.3, random_state=42)
model.fit(X_train, y_train)

##### Step Five: Make predictions and evaluate how accurate our model is!

In [46]:
# Predict sentiment labels for the test data
y_pred = model.predict(X_test)

# Calculate accuracy and display a classification report
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.7105263157894737
Classification Report:
               precision    recall  f1-score   support

    negative       0.73      0.64      0.68      1001
     neutral       0.64      0.76      0.70      1430
    positive       0.81      0.71      0.76      1103

    accuracy                           0.71      3534
   macro avg       0.73      0.70      0.71      3534
weighted avg       0.72      0.71      0.71      3534

