In [217]:
import pandas as pd
import re
import nltk
import joblib
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report

In [None]:
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to C:\Users\Ferdinand
[nltk_data]     lowata\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\Ferdinand
[nltk_data]     lowata\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [219]:
df = pd.read_csv("Tweets.csv")

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27481 entries, 0 to 27480
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   textID         27481 non-null  object
 1   text           27480 non-null  object
 2   selected_text  27480 non-null  object
 3   sentiment      27481 non-null  object
dtypes: object(4)
memory usage: 858.9+ KB


In [220]:
df.isnull().sum()

textID           0
text             1
selected_text    1
sentiment        0
dtype: int64

In [221]:
df = df.dropna()

df.head()

Unnamed: 0,textID,text,selected_text,sentiment
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative
2,088c60f138,my boss is bullying me...,bullying me,negative
3,9642c003ef,what interview! leave me alone,leave me alone,negative
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative


In [222]:
X = df['text']
y = df['sentiment'] 

le = LabelEncoder()
y_encoded = le.fit_transform(y)

In [223]:
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.3, random_state=42)

In [224]:
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

class TextPreprocessor(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass

    def clean_text(self, text):
        text = str(text).lower()
        text = re.sub(r"http\S+|www\S+|https\S+", '', text)
        text = re.sub(r'\@w+|\#','', text)
        text = re.sub(r'[^a-z\s]', '', text)
        tokens = text.split()
        tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
        return ' '.join(tokens)

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X.apply(self.clean_text).tolist()  # ← Perbaikan penting


In [225]:
pipeline = Pipeline([
    ('preprocess', TextPreprocessor()),
    ('tfidf', TfidfVectorizer(max_features=5000, ngram_range=(1, 2))),
    ('model', LinearSVC(C=0.1, class_weight='balanced', max_iter=1000))
])

In [227]:
pipeline.fit(X_train, y_train)

y_train_pred = pipeline.predict(X_train)
y_test_pred = pipeline.predict(X_test)

print("\nTraining Accuracy:", accuracy_score(y_train, y_train_pred))
print("Testing Accuracy:", accuracy_score(y_test, y_test_pred))
print("\nClassification Report (Testing):")
print(classification_report(y_test, y_test_pred, target_names=le.classes_))


Training Accuracy: 0.7759929299230609
Testing Accuracy: 0.6851043182920912

Classification Report (Testing):
              precision    recall  f1-score   support

    negative       0.68      0.61      0.64      2356
     neutral       0.64      0.70      0.67      3343
    positive       0.75      0.74      0.75      2545

    accuracy                           0.69      8244
   macro avg       0.69      0.68      0.69      8244
weighted avg       0.69      0.69      0.68      8244



In [228]:
joblib.dump(pipeline, 'sentiment_pipeline.pkl')
joblib.dump(le, 'label_encoder.pkl')

['label_encoder.pkl']