In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
import os
import string
import joblib
from text_loader.loader_new import DataLoader
import warnings 
warnings.filterwarnings("ignore")

In [2]:
loader_new = DataLoader("../data/Tweets.csv", True)

In [3]:
df = loader_new.load_data()
df.head(10)

Unnamed: 0,Party,Tweet
0,Republican,Joined Manatee Educational TV today to talk op...
1,Democrat,RT @CHOPadvocacy: Thank you to @RepDwightEvans...
2,Democrat,Mission Accomplished? Not so easy Mr. Preside...
3,Republican,Joined my @Transport colleague @RepEsty to int...
4,Republican,RT @HouseGOP: The 115th Congress is working ha...
5,Republican,Today I met with @CBAGeorgia to talk about the...
6,Democrat,More proof that the fight to #SaveNetNeutralit...
7,Republican,Joined Michael Ball and Christy Kenady from @S...
8,Republican,RT @AACR: Chairman @TomColeOK04 has been a lon...
9,Democrat,"Last week, Senator @CoryBooker and I introduce..."


In [4]:
df.shape

(86461, 2)

In [5]:
#Checking for labels
df['Party'].value_counts()

Party
Republican                 44392
Democrat                   42068
https://t.co/sbSljdw1Zw        1
Name: count, dtype: int64

In [6]:
df[df['Tweet'].isna()]

Unnamed: 0,Party,Tweet
39172,https://t.co/sbSljdw1Zw,


In [7]:
#Checking if Tweet contains only strings
non_str_val = df[~df['Tweet'].apply(lambda x: isinstance(x, str))]

In [8]:
print(non_str_val)

                         Party Tweet
39172  https://t.co/sbSljdw1Zw   NaN


In [9]:
#Since row is just one deleting this row
df = df[df['Tweet'].apply(lambda x: isinstance(x, str))]

In [10]:
df['Party'].value_counts()

Party
Republican    44392
Democrat      42068
Name: count, dtype: int64

## Clean Text

In [11]:
df['Tweet'] = df['Tweet'].apply(loader_new.clean_text)
df.head()

Unnamed: 0,Party,Tweet
0,Republican,Joined Manatee Educational TV today to talk op...
1,Democrat,RT CHOPadvocacy Thank you to RepDwightEvans fo...
2,Democrat,Mission Accomplished Not so easy Mr President...
3,Republican,Joined my Transport colleague RepEsty to intro...
4,Republican,RT HouseGOP The 115th Congress is working hard...


## Encoding the label(y)

In [12]:
df['Party'] = loader_new.label_encoder(df['Party'].tolist())
df.head()

Unnamed: 0,Party,Tweet
0,1,Joined Manatee Educational TV today to talk op...
1,0,RT CHOPadvocacy Thank you to RepDwightEvans fo...
2,0,Mission Accomplished Not so easy Mr President...
3,1,Joined my Transport colleague RepEsty to intro...
4,1,RT HouseGOP The 115th Congress is working hard...


## Republican = 1 and Democrat = 0

In [13]:
X_train, X_test, y_train, y_test = train_test_split(df['Tweet'], df['Party'], stratify=df['Party'], test_size = 0.2 , random_state = 42)

In [14]:
X_train.shape,X_test.shape,y_train.shape,y_test.shape

((69168,), (17292,), (69168,), (17292,))

In [15]:
y_train.value_counts(), y_test.value_counts()

(Party
 1    35514
 0    33654
 Name: count, dtype: int64,
 Party
 1    8878
 0    8414
 Name: count, dtype: int64)

In [16]:
X_train.head()

26035    As a proud Pi Kappa Alpha I was delighted to c...
39608    Lyme disease and other tickborne related illne...
84651    AngieRowe Thx 4 great healthcare discussion Iv...
35526    RT FoxNews RepLeeZeldin “All signals is that t...
77900    On with ProudExDemocrat now  talking tax refor...
Name: Tweet, dtype: object

## Vectorizing the X(Tweet)

In [17]:
#Assigning tweets to X
X_train_vector = loader_new.vectorize_text(X_train.tolist(), fit=True)
X_test_vector = loader_new.vectorize_text(X_test.tolist(), fit=False)

In [18]:
X_train_vector.shape,y_train.shape,X_test_vector.shape,y_test.shape

((69168, 2500), (69168,), (17292, 2500), (17292,))

In [19]:
model = xgb.XGBClassifier(use_label_encoder=False, eval_metric = 'logloss', max_depth = 10)
model.fit(X_train_vector, y_train)

In [20]:
preds = model.predict(X_test_vector)

In [21]:
preds.shape

(17292,)

In [22]:
acc = accuracy_score(y_test, preds)

In [23]:
print(acc)

0.7102706453851492


In [29]:
print("Vectorizer loaded:", type(loader_new.vectorizer))
print("Has IDF:", hasattr(loader_new.vectorizer, "idf_"))
print("Vocabulary size:", len(loader_new.vectorizer.vocabulary_))

Vectorizer loaded: <class 'sklearn.feature_extraction.text.TfidfVectorizer'>
Has IDF: True
Vocabulary size: 2500


In [24]:
os.makedirs("model-inference-endpoint/saved_model", exist_ok=True)
os.makedirs("app/saved_model", exist_ok=True)

joblib.dump(model, "model-inference-endpoint/saved_model/model.pkl")
joblib.dump(loader_new.vectorizer, "model-inference-endpoint/saved_model/vectorizer.pkl")

joblib.dump(model, "app/saved_model/model.pkl")
joblib.dump(loader_new.vectorizer, "app/saved_model/vectorizer.pkl")

['app/saved_model/vectorizer.pkl']

In [25]:
vectorizer = joblib.load("app/saved_model/vectorizer.pkl")

In [30]:
print("Vectorizer loaded:", type(vectorizer))
print("Has IDF:", hasattr(vectorizer, "idf_"))
print("Vocabulary size:", len(vectorizer.vocabulary_))

Vectorizer loaded: <class 'sklearn.feature_extraction.text.TfidfVectorizer'>
Has IDF: True
Vocabulary size: 2500


In [26]:
tfidf_matrix=vectorizer.transform(["RepTomReed: Thanks to my good friend"])

In [27]:
vectorizer

In [28]:
print(tfidf_matrix.toarray())

[[0. 0. 0. ... 0. 0. 0.]]


In [32]:
import sklearn
print(sklearn.__version__)

1.6.1


In [33]:
!pip install scikit-learn==1.4.2

Collecting scikit-learn==1.4.2
  Downloading scikit_learn-1.4.2-cp312-cp312-macosx_12_0_arm64.whl.metadata (11 kB)
Downloading scikit_learn-1.4.2-cp312-cp312-macosx_12_0_arm64.whl (10.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.5/10.5 MB[0m [31m10.4 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: scikit-learn
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 1.6.1
    Uninstalling scikit-learn-1.6.1:
      Successfully uninstalled scikit-learn-1.6.1
Successfully installed scikit-learn-1.4.2

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
