## Sentiment Classification Project

In [1]:
import pandas as pd
import numpy as np

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

In [9]:
df = pd.read_csv('YoutubeCommentsDataSet_Filtered.csv')

In [7]:
df.shape

(14314, 2)

In [10]:
df.head()

Unnamed: 0,Comment,Sentiment
0,lets not forget that apple pay in 2014 require...,neutral
1,i will forever acknowledge this channel with t...,positive
2,apple pay is so convenient secure and easy to ...,positive
3,for now i need both apple pay and the physical...,neutral
4,in the united states we have an abundance of r...,positive


In [4]:
from sklearn.base import BaseEstimator, TransformerMixin

class TextPreprocessorTransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.lemmatizer = WordNetLemmatizer()
        self.stop_words = set(stopwords.words('english'))

    def fit(self, X, y=None):
        return self  # No fitting required

    def transform(self, X):
        return [self.preprocess(text) for text in X]  # Apply preprocessing to each input text

    def preprocess(self, text):
        tokens = word_tokenize(text.lower())  # Lowercase and tokenize
        filtered_tokens = [token for token in tokens if token not in self.stop_words]  # Remove stopwords
        lemmatized_tokens = [self.lemmatizer.lemmatize(token) for token in filtered_tokens]  # Lemmatize
        return " ".join(lemmatized_tokens)  # Convert list back to string


In [11]:
preprocessor = TextPreprocessorTransformer()

df['Comment'] = df['Comment'].apply(preprocessor.transform)

In [12]:
df.head()

Unnamed: 0,Comment,Sentiment
0,"[l, e, , , , n, , , , f, , r, g, e, , , , h, ,...",neutral
1,"[, , w, , l, l, , f, , r, e, v, e, r, , , c, k...",positive
2,"[, p, p, l, e, , p, , , , , , , , , , c, , n, ...",positive
3,"[f, , r, , n, , w, , , , n, e, e, , , b, , , h...",neutral
4,"[, n, , , h, e, , u, n, , , e, , , , , , , e, ...",positive
