In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
import os
import string
import joblib
from text_loader.loader_new import DataLoader
import warnings 
warnings.filterwarnings("ignore")

In [2]:
loader_new = DataLoader("../data/Tweets.csv", True)

In [3]:
df = loader_new.load_data()
df.head(10)

Unnamed: 0,Party,Tweet
0,Republican,Joined Manatee Educational TV today to talk op...
1,Democrat,RT @CHOPadvocacy: Thank you to @RepDwightEvans...
2,Democrat,Mission Accomplished? Not so easy Mr. Preside...
3,Republican,Joined my @Transport colleague @RepEsty to int...
4,Republican,RT @HouseGOP: The 115th Congress is working ha...
5,Republican,Today I met with @CBAGeorgia to talk about the...
6,Democrat,More proof that the fight to #SaveNetNeutralit...
7,Republican,Joined Michael Ball and Christy Kenady from @S...
8,Republican,RT @AACR: Chairman @TomColeOK04 has been a lon...
9,Democrat,"Last week, Senator @CoryBooker and I introduce..."


In [4]:
df.shape

(86461, 2)

In [5]:
#Checking for labels
df['Party'].value_counts()

Party
Republican                 44392
Democrat                   42068
https://t.co/sbSljdw1Zw        1
Name: count, dtype: int64

In [6]:
df[df['Tweet'].isna()]

Unnamed: 0,Party,Tweet
39172,https://t.co/sbSljdw1Zw,


In [7]:
#Checking if Tweet contains only strings
non_str_val = df[~df['Tweet'].apply(lambda x: isinstance(x, str))]

In [8]:
print(non_str_val)

                         Party Tweet
39172  https://t.co/sbSljdw1Zw   NaN


In [9]:
#Since row is just one deleting this row
df = df[df['Tweet'].apply(lambda x: isinstance(x, str))]

In [10]:
df['Party'].value_counts()

Party
Republican    44392
Democrat      42068
Name: count, dtype: int64

## Clean Text

In [11]:
df['Tweet'] = df['Tweet'].apply(loader_new.clean_text)
df.head()

Unnamed: 0,Party,Tweet
0,Republican,Joined Manatee Educational TV today to talk op...
1,Democrat,RT CHOPadvocacy Thank you to RepDwightEvans fo...
2,Democrat,Mission Accomplished Not so easy Mr President...
3,Republican,Joined my Transport colleague RepEsty to intro...
4,Republican,RT HouseGOP The th Congress is working hard fo...


## Encoding the label(y)

In [12]:
df['Party'] = loader_new.label_encoder(df['Party'].tolist())
df.head()

Unnamed: 0,Party,Tweet
0,1,Joined Manatee Educational TV today to talk op...
1,0,RT CHOPadvocacy Thank you to RepDwightEvans fo...
2,0,Mission Accomplished Not so easy Mr President...
3,1,Joined my Transport colleague RepEsty to intro...
4,1,RT HouseGOP The th Congress is working hard fo...


## Republican = 1 and Democrat = 0

In [13]:
X_train, X_test, y_train, y_test = train_test_split(df['Tweet'], df['Party'], stratify=df['Party'], test_size = 0.2 , random_state = 42)

In [14]:
X_train.shape,X_test.shape,y_train.shape,y_test.shape

((69168,), (17292,), (69168,), (17292,))

In [15]:
y_train.value_counts(), y_test.value_counts()

(Party
 1    35514
 0    33654
 Name: count, dtype: int64,
 Party
 1    8878
 0    8414
 Name: count, dtype: int64)

In [16]:
X_train.head()

26035    As a proud Pi Kappa Alpha I was delighted to c...
39608    Lyme disease and other tickborne related illne...
84651    AngieRowe Thx  great healthcare discussion Ive...
35526    RT FoxNews RepLeeZeldin “All signals is that t...
77900    On with ProudExDemocrat now  talking tax refor...
Name: Tweet, dtype: object

## Vectorizing the X(Tweet)

In [17]:
#Assigning tweets to X
X_train_vector = loader_new.vectorize_text(X_train.tolist(), fit=True)
X_test_vector = loader_new.vectorize_text(X_test.tolist(), fit=False)

In [18]:
X_train_vector.shape,y_train.shape,X_test_vector.shape,y_test.shape

((69168, 2500), (69168,), (17292, 2500), (17292,))

In [19]:
model = xgb.XGBClassifier(use_label_encoder=False, eval_metric = 'logloss', max_depth = 10, objective='binary:logistic')
model.fit(X_train_vector, y_train)

In [20]:
preds = model.predict(X_test_vector)

In [21]:
pd.value_counts(preds)

1    9805
0    7487
Name: count, dtype: int64

In [22]:
X_train.shape, X_train_vector.shape

((69168,), (69168, 2500))

In [23]:
train_preds = model.predict(X_train_vector)

In [24]:
train_preds

array([1, 1, 1, ..., 1, 0, 0])

In [25]:
X_train[26035]

'As a proud Pi Kappa Alpha I was delighted to celebrate the th Anniversary of our brotherhood on the HouseFloor…'

In [26]:
pd.value_counts(train_preds)

1    39245
0    29923
Name: count, dtype: int64

In [27]:
preds.shape

(17292,)

In [28]:
acc = accuracy_score(y_test, preds)

In [29]:
print(acc)

0.7118320610687023


In [30]:
print("Vectorizer loaded:", type(loader_new.vectorizer))
print("Has IDF:", hasattr(loader_new.vectorizer, "idf_"))
print("Vocabulary size:", len(loader_new.vectorizer.vocabulary_))

Vectorizer loaded: <class 'sklearn.feature_extraction.text.TfidfVectorizer'>
Has IDF: True
Vocabulary size: 2500


In [31]:
os.makedirs("model-inference-endpoint/saved_model", exist_ok=True)
os.makedirs("app/saved_model", exist_ok=True)

model.get_booster().save_model("model-inference-endpoint/saved_model/model.json")
#joblib.dump(model, "model-inference-endpoint/saved_model/model.pkl")
joblib.dump(loader_new.vectorizer, "model-inference-endpoint/saved_model/vectorizer.pkl")

model.get_booster().save_model("app/saved_model/model.json")
#joblib.dump(model, "app/saved_model/model.pkl")
joblib.dump(loader_new.vectorizer, "app/saved_model/vectorizer.pkl")

['app/saved_model/vectorizer.pkl']

In [32]:
#!pip install scikit-learn==1.6.1

In [36]:
from text_loader.loader_new import DataLoader
import xgboost as xgb
import joblib

# Load model and vectorizer
booster = xgb.Booster()
booster.load_model("model-inference-endpoint/saved_model/model.json")

vectorizer = joblib.load("model-inference-endpoint/saved_model/vectorizer.pkl")
loader = DataLoader("", False)

# Step 1: Clean your tweet
tweet = "As a proud Pi Kappa Alpha I was delighted to celebrate the th Anniversary of our brotherhood on the HouseFloor…"
cleaned = loader.clean_text(tweet)

# Step 2: Vectorize
vec = vectorizer.transform([cleaned])

# Step 3: Convert to DMatrix
dmatrix = xgb.DMatrix(vec)

# Step 4: Predict using booster
prob_class_1 = booster.predict(dmatrix)[0]  # Republican = 1
pred = int(prob_class_1 >= 0.5)
label = "Republican" if pred == 1 else "Democrat"
confidence = round(prob_class_1 if pred == 1 else 1 - prob_class_1, 3)

print("Cleaned:", cleaned)
print("Prediction:", label)
print("Confidence:", confidence)

Cleaned: As a proud Pi Kappa Alpha I was delighted to celebrate the th Anniversary of our brotherhood on the HouseFloor…
Prediction: Democrat
Confidence: 0.998
