In [13]:
import pandas as pd                        
import numpy as np                         
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline


In [14]:
data = pd.read_csv(f"/Users/krishnakumar/Desktop/AMLT/svm.csv")
print(data.head())

                                 text    category author source_type  \
0         Tech conference held online  technology  Rahul      report   
1      Scientists discover new planet  technology  Maria        news   
2   Software update improves security  technology  Maria      social   
3  Internet speed improved nationwide  technology     Li        blog   
4        College sports day organized      sports  Kumar        news   

  sentiment  text_length  word_count  has_number  contains_ai  day_posted  \
0  positive           27           4           0            0          14   
1   neutral           30           4           0            0          27   
2   neutral           33           4           0            0           1   
3  positive           34           4           0            0          13   
4  positive           28           4           0            0          30   

   month_posted  read_time_min  
0             9            1.2  
1             5            1.2  
2    

In [15]:
X = data.drop("category", axis=1)

y = data["category"]

In [16]:
text_col = "text"

cat_cols = ["author", "source_type", "sentiment"]

num_cols = ["text_length", "word_count", "has_number",
            "contains_ai", "day_posted", "month_posted",
            "read_time_min"]



In [17]:
preprocessor = ColumnTransformer(

    transformers=[

        ("tfidf", TfidfVectorizer(stop_words="english"), "text"),

        ("cat",
         OneHotEncoder(handle_unknown="ignore"),
         ["author", "source_type", "sentiment"]),

        ("num",
         StandardScaler(),
         ["text_length", "word_count", "has_number",
          "contains_ai", "day_posted", "month_posted",
          "read_time_min"])
    ]
)


In [18]:

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42
)



In [19]:
svm_model = Pipeline(steps=[

    ("preprocess", preprocessor),

    ("classifier", SVC(kernel="linear"))
])

svm_model.fit(X_train, y_train)



0,1,2
,steps,"[('preprocess', ...), ('classifier', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('tfidf', ...), ('cat', ...), ...]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,
,analyzer,'word'
,stop_words,'english'
,token_pattern,'(?u)\\b\\w\\w+\\b'

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,C,1.0
,kernel,'linear'
,degree,3
,gamma,'scale'
,coef0,0.0
,shrinking,True
,probability,False
,tol,0.001
,cache_size,200
,class_weight,


In [8]:
y_pred = svm_model.predict(X_test)

In [9]:
print("Accuracy:", accuracy_score(y_test, y_pred))

print("Precision:",
      precision_score(y_test, y_pred, average="weighted"))

print("Recall:",
      recall_score(y_test, y_pred, average="weighted"))

print("F1 Score:",
      f1_score(y_test, y_pred, average="weighted"))

print("\nDetailed Report\n")
print(classification_report(y_test, y_pred))

Accuracy: 0.9166666666666666
Precision: 0.9285714285714285
Recall: 0.9166666666666666
F1 Score: 0.8990384615384616

Detailed Report

              precision    recall  f1-score   support

    politics       1.00      0.33      0.50         3
      sports       1.00      1.00      1.00         9
  technology       0.86      1.00      0.92        12

    accuracy                           0.92        24
   macro avg       0.95      0.78      0.81        24
weighted avg       0.93      0.92      0.90        24



In [10]:
param_grid = {
    "classifier__C": [0.1, 1, 10],
    "classifier__kernel": ["linear", "rbf"]
}

grid = GridSearchCV(
    svm_model,
    param_grid,
    cv=3
)

grid.fit(X_train, y_train)

print("Best Parameters:", grid.best_params_)



Best Parameters: {'classifier__C': 1, 'classifier__kernel': 'linear'}


In [11]:

cv_scores = cross_val_score(svm_model, X, y, cv=5)

print("Cross Validation Accuracy:", np.mean(cv_scores))



Cross Validation Accuracy: 0.9583333333333333


In [12]:


rf_model = Pipeline(steps=[

    ("preprocess", preprocessor),

    ("classifier", RandomForestClassifier())
])

rf_model.fit(X_train, y_train)

rf_pred = rf_model.predict(X_test)

print("\nRandom Forest Accuracy:",
      accuracy_score(y_test, rf_pred))


Random Forest Accuracy: 0.9166666666666666
