# 3. Model Training and Selection

This section covers:
1. Processing text data using TF-IDF vectorization (output saved for reuse as `tfidf_vectorized.csv`).
2. Using SMOTE to balance training data for underrepresented categories.
3. Training and evaluating various classification models:
   - Random Forest
   - Support Vector Machine (SVM)
   - k-Nearest Neighbors (k-NN)
   - Naive Bayes
   - Logistic Regression
   - Logistic Regression with One-vs-Rest (OvR) strategy.
   - Decision Tree
4. Hyperparameter tuning with GridSearch to optimize performance.

## 3.1 TF-IDF Vectorization

The text data is processed using TF-IDF vectorizer, and the output is saved to `tfidf_vectorized.csv`.


In [5]:
import json
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

# Load the JSON dataset
file_path = 'merged_profiles.json'
with open(file_path, 'r', encoding='utf-8') as file:
    data = json.load(file)

# Convert to DataFrame
df = pd.DataFrame(data)

# Display the first few rows to understand the structure
print(df.head())

# Ensure the 'text' column is selected for TF-IDF vectorization
texts = df['text']

# Initialize TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer(
    max_features=10000,
    # norm='l2',
    # ngram_range=(1, 2)
)


# Fit and transform the text data
tfidf_matrix = tfidf_vectorizer.fit_transform(texts)

# Convert to DataFrame for better visualization
tfidf_df = pd.DataFrame(
    tfidf_matrix.toarray(),
    columns=tfidf_vectorizer.get_feature_names_out()
)

# Add the category column for reference
tfidf_df['category'] = df['category']

# Save the TF-IDF DataFrame or inspect it
tfidf_df.to_csv('tfidf_vectorized.csv', index=False)
print("TF-IDF vectorization completed and saved.")


          username                                               text  \
0    taskirancemal  cemal taşkıran kaş ah merve çocuk bak ol kork ...   
1    tam_kararinda  kaan yâr milliyet pazar kalori ala fayda deney...   
2         spart4nn  cemil ceylân küçük ev kamp doğa mutfak reklam ...   
3  sosyalyiyiciler  keşif keşif çakal menemen ekstra dahil sınır ç...   
4  sonaydizdarahad  sonay dizdar dağhan eyvallah şanlıurfa ener ar...   

           category  
0  mom and children  
1              food  
2              food  
3              food  
4  mom and children  
TF-IDF vectorization completed and saved.


# Random Forest with SMOTE

In [60]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from imblearn.over_sampling import SMOTE

# Load the TF-IDF Vectorized Data
file_path = 'tfidf_vectorized.csv'
df = pd.read_csv(file_path)

# Separate features and target
X = df.drop('category', axis=1)
y = df['category']

threshold = 200

# Identify categories with less than 200 samples
category_counts = y.value_counts()
categories_to_resample = category_counts[category_counts < threshold].index

# Apply SMOTE only to underrepresented categories
smote = SMOTE(sampling_strategy={category: threshold for category in categories_to_resample}, random_state=21)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Split the resampled data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=21)

# Comment/uncomment the sections below based on your requirement

# 1. Perform Grid Search
# Uncomment this section to find the best parameters using Grid Search
# param_grid = {
#     'n_estimators': [50, 100, 200],
#     'max_depth': [None, 10, 20, 30],
#     'min_samples_split': [2, 5, 10],
#     'min_samples_leaf': [1, 2, 4],
#     'bootstrap': [True, False]
# }
# rf = RandomForestClassifier(random_state=42)
# grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2)
# grid_search.fit(X_train, y_train)
# print("Best Hyperparameters:", grid_search.best_params_)
# best_rf = grid_search.best_estimator_

# 2. Use Predefined Best Parameters
# Uncomment this section if you want to directly use predefined parameters
best_params = {
    'bootstrap': False,
    'max_depth': None,
    'min_samples_leaf': 1,
    'min_samples_split': 10,
    'n_estimators': 200
}
best_rf = RandomForestClassifier(random_state=42, **best_params)
best_rf.fit(X_train, y_train)

# Predict on the test set
y_pred = best_rf.predict(X_test)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report Random Forest with SMOTE:\n", classification_report(y_test, y_pred))


Accuracy: 0.697560975609756

Classification Report Random Forest with SMOTE:
                       precision    recall  f1-score   support

                 art       0.67      0.15      0.24        41
       entertainment       0.51      0.45      0.47        74
             fashion       0.68      0.73      0.70        66
                food       0.78      0.90      0.84        99
              gaming       1.00      1.00      1.00        36
health and lifestyle       0.54      0.74      0.63       103
    mom and children       0.96      0.64      0.77        39
              sports       0.96      0.75      0.84        36
                tech       0.72      0.81      0.76        77
              travel       0.68      0.61      0.64        44

            accuracy                           0.70       615
           macro avg       0.75      0.68      0.69       615
        weighted avg       0.71      0.70      0.69       615



# SVM with SMOTE

In [62]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score
from imblearn.over_sampling import SMOTE

# Load the TF-IDF Vectorized Data
file_path = 'tfidf_vectorized.csv'
df = pd.read_csv(file_path)

# Separate features and target
X = df.drop('category', axis=1)
y = df['category']

threshold = 200

# Identify categories with less than 200 samples
category_counts = y.value_counts()
categories_to_resample = category_counts[category_counts < threshold].index

# Apply SMOTE only to underrepresented categories
smote = SMOTE(sampling_strategy={category: threshold for category in categories_to_resample}, random_state=21)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Split the resampled data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=21)

# Comment/uncomment the sections below based on your requirement

# 1. Perform Grid Search
# Uncomment this section to find the best parameters using Grid Search
# param_grid = {
#     'C': [0.1, 1, 10, 100],
#     'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
#     'degree': [2, 3, 4],  # Only relevant for 'poly' kernel
#     'gamma': ['scale', 'auto']
# }
# svm = SVC(random_state=42)
# grid_search = GridSearchCV(estimator=svm, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2)
# grid_search.fit(X_train, y_train)
# print("Best Hyperparameters:", grid_search.best_params_)
# best_svm = grid_search.best_estimator_

# 2. Use Predefined Best Parameters
# Uncomment this section if you want to directly use predefined parameters
best_params = {'C': 1, 'degree': 2, 'gamma': 'scale', 'kernel': 'linear'}

best_svm = SVC(random_state=42, **best_params)
best_svm.fit(X_train, y_train)

# Predict on the test set
y_pred = best_svm.predict(X_test)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report SVM with SMOTE:\n", classification_report(y_test, y_pred))


Accuracy: 0.6943089430894309

Classification Report SVM with SMOTE:
                       precision    recall  f1-score   support

                 art       0.43      0.24      0.31        41
       entertainment       0.50      0.50      0.50        74
             fashion       0.63      0.62      0.63        66
                food       0.82      0.84      0.83        99
              gaming       1.00      1.00      1.00        36
health and lifestyle       0.61      0.70      0.65       103
    mom and children       0.91      0.82      0.86        39
              sports       0.96      0.72      0.83        36
                tech       0.71      0.75      0.73        77
              travel       0.60      0.73      0.66        44

            accuracy                           0.69       615
           macro avg       0.72      0.69      0.70       615
        weighted avg       0.70      0.69      0.69       615



# Naive Bayes with SMOTE

In [63]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score
from imblearn.over_sampling import SMOTE

# Load the TF-IDF Vectorized Data
file_path = 'tfidf_vectorized.csv'
df = pd.read_csv(file_path)

# Separate features and target
X = df.drop('category', axis=1)
y = df['category']

threshold = 200

# Identify categories with less than 200 samples
category_counts = y.value_counts()
categories_to_resample = category_counts[category_counts < threshold].index

# Apply SMOTE only to underrepresented categories
smote = SMOTE(sampling_strategy={category: threshold for category in categories_to_resample}, random_state=21)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Split the resampled data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=21)

# Comment/uncomment the sections below based on your requirement

# 1. Perform Grid Search
# Uncomment this section to find the best parameters using Grid Search
# param_grid = {
#     'alpha': [0.1, 0.5, 1.0],  # Smoothing parameter
# }
# nb = MultinomialNB()
# grid_search = GridSearchCV(estimator=nb, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2)
# grid_search.fit(X_train, y_train)
# print("Best Hyperparameters:", grid_search.best_params_)
# best_nb = grid_search.best_estimator_

# 2. Use Predefined Best Parameters
# Uncomment this section if you want to directly use predefined parameters
best_params = {'alpha': 0.1}  # Default smoothing parameter
best_nb = MultinomialNB(**best_params)
best_nb.fit(X_train, y_train)

# Predict on the test set
y_pred = best_nb.predict(X_test)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report Naive Bayes with SMOTE:\n", classification_report(y_test, y_pred))


Accuracy: 0.6894308943089431

Classification Report Naive Bayes with SMOTE:
                       precision    recall  f1-score   support

                 art       0.67      0.20      0.30        41
       entertainment       0.50      0.41      0.45        74
             fashion       0.72      0.67      0.69        66
                food       0.87      0.83      0.85        99
              gaming       1.00      1.00      1.00        36
health and lifestyle       0.55      0.76      0.63       103
    mom and children       0.86      0.64      0.74        39
              sports       0.86      0.67      0.75        36
                tech       0.70      0.83      0.76        77
              travel       0.55      0.75      0.63        44

            accuracy                           0.69       615
           macro avg       0.73      0.67      0.68       615
        weighted avg       0.70      0.69      0.68       615



# Decision Tree with SMOTE

In [64]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, accuracy_score
from imblearn.over_sampling import SMOTE

# Load the TF-IDF Vectorized Data
file_path = 'tfidf_vectorized.csv'
df = pd.read_csv(file_path)

# Separate features and target
X = df.drop('category', axis=1)
y = df['category']

threshold = 200

# Identify categories with less than 200 samples
category_counts = y.value_counts()
categories_to_resample = category_counts[category_counts < threshold].index

# Apply SMOTE only to underrepresented categories
smote = SMOTE(sampling_strategy={category: threshold for category in categories_to_resample}, random_state=21)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Split the resampled data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=21)

# Comment/uncomment the sections below based on your requirement

# 1. Perform Grid Search
# Uncomment this section to find the best parameters using Grid Search
# param_grid = {
#     'criterion': ['gini', 'entropy'],  # Splitting criterion
#     'max_depth': [None, 10, 20, 30],  # Maximum depth of the tree
#     'min_samples_split': [2, 5, 10],  # Minimum samples required to split
#     'min_samples_leaf': [1, 2, 4],    # Minimum samples per leaf
# }
# dt = DecisionTreeClassifier(random_state=42)
# grid_search = GridSearchCV(estimator=dt, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2)
# grid_search.fit(X_train, y_train)
# print("Best Hyperparameters:", grid_search.best_params_)
# best_dt = grid_search.best_estimator_

# 2. Use Predefined Best Parameters
# Uncomment this section if you want to directly use predefined parameters
best_params = {'criterion': 'gini', 'max_depth': None, 'min_samples_leaf': 2, 'min_samples_split': 10}
best_dt = DecisionTreeClassifier(random_state=42, **best_params)
best_dt.fit(X_train, y_train)

# Predict on the test set
y_pred = best_dt.predict(X_test)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report Decision Tree with SMOTE:\n", classification_report(y_test, y_pred))


Accuracy: 0.4991869918699187

Classification Report Decision Tree with SMOTE:
                       precision    recall  f1-score   support

                 art       0.17      0.20      0.18        41
       entertainment       0.30      0.32      0.31        74
             fashion       0.48      0.50      0.49        66
                food       0.67      0.65      0.66        99
              gaming       0.94      0.94      0.94        36
health and lifestyle       0.44      0.40      0.42       103
    mom and children       0.50      0.62      0.55        39
              sports       0.70      0.44      0.54        36
                tech       0.53      0.61      0.57        77
              travel       0.46      0.36      0.41        44

            accuracy                           0.50       615
           macro avg       0.52      0.50      0.51       615
        weighted avg       0.51      0.50      0.50       615



# k-NN with SMOTE

In [65]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, accuracy_score
from imblearn.over_sampling import SMOTE

# Load the TF-IDF Vectorized Data
file_path = 'tfidf_vectorized.csv'
df = pd.read_csv(file_path)

# Separate features and target
X = df.drop('category', axis=1)
y = df['category']

threshold = 200

# Identify categories with less than 200 samples
category_counts = y.value_counts()
categories_to_resample = category_counts[category_counts < threshold].index

# Apply SMOTE only to underrepresented categories
smote = SMOTE(sampling_strategy={category: threshold for category in categories_to_resample}, random_state=21)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Split the resampled data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=21)

# Comment/uncomment the sections below based on your requirement

# 1. Perform Grid Search
# Uncomment this section to find the best parameters using Grid Search
# param_grid = {
#     'n_neighbors': [3, 5, 7, 9],  # Number of neighbors
#     'weights': ['uniform', 'distance'],  # Weighting strategy
#     'metric': ['euclidean', 'manhattan', 'minkowski']  # Distance metric
# }
# knn = KNeighborsClassifier()
# grid_search = GridSearchCV(estimator=knn, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2)
# grid_search.fit(X_train, y_train)
# print("Best Hyperparameters:", grid_search.best_params_)
# best_knn = grid_search.best_estimator_

# 2. Use Predefined Best Parameters
# Uncomment this section if you want to directly use predefined parameters
best_params = {'metric': 'euclidean', 'n_neighbors': 5, 'weights': 'distance'}
best_knn = KNeighborsClassifier(**best_params)
best_knn.fit(X_train, y_train)

# Predict on the test set
y_pred = best_knn.predict(X_test)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report K-NN with SMOTE:\n", classification_report(y_test, y_pred))


Accuracy: 0.375609756097561

Classification Report K-NN with SMOTE:
                       precision    recall  f1-score   support

                 art       0.55      0.15      0.23        41
       entertainment       0.00      0.00      0.00        74
             fashion       0.67      0.06      0.11        66
                food       0.92      0.36      0.52        99
              gaming       0.97      1.00      0.99        36
health and lifestyle       0.21      0.87      0.33       103
    mom and children       0.88      0.56      0.69        39
              sports       0.89      0.67      0.76        36
                tech       0.60      0.04      0.07        77
              travel       0.34      0.23      0.27        44

            accuracy                           0.38       615
           macro avg       0.60      0.39      0.40       615
        weighted avg       0.56      0.38      0.34       615



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


# Logistic Rgression with SMOTE

In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from imblearn.over_sampling import SMOTE

# Load the TF-IDF Vectorized Data
file_path = 'tfidf_vectorized.csv'
df = pd.read_csv(file_path)

# Separate features and target
X = df.drop('category', axis=1)
y = df['category']

threshold = 200

# Identify categories with less than 200 samples
category_counts = y.value_counts()
categories_to_resample = category_counts[category_counts < threshold].index

# Apply SMOTE only to underrepresented categories
smote = SMOTE(sampling_strategy={category: threshold for category in categories_to_resample}, random_state=21)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Split the resampled data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=21)

# Comment/uncomment the sections below based on your requirement

# 1. Perform Grid Search
# Uncomment this section to find the best parameters using Grid Search
# param_grid = {
#     'C': [0.1, 1, 10, 100],         # Regularization strength
#     'penalty': ['l2'],              # Regularization type
#     'solver': ['lbfgs', 'saga'],    # Solvers for optimization
#     'multi_class': ['ovr', 'multinomial']  # Multiclass strategy
# }
# lr = LogisticRegression(random_state=42, max_iter=1000)
# grid_search = GridSearchCV(estimator=lr, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2)
# grid_search.fit(X_train, y_train)
# print("Best Hyperparameters:", grid_search.best_params_)
# best_lr = grid_search.best_estimator_

# 2. Use Predefined Best Parameters
# Uncomment this section if you want to directly use predefined parameters
best_params = {'C': 10, 'multi_class': 'ovr', 'penalty': 'l2', 'solver': 'saga'}
best_lr = LogisticRegression(random_state=42, max_iter=1000, **best_params)
best_lr.fit(X_train, y_train)

# Predict on the test set
y_pred = best_lr.predict(X_test)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report Logistic Regression with SMOTE:\n", classification_report(y_test, y_pred))




Accuracy: 0.7138211382113822

Classification Report Logistic Regression with SMOTE:
                       precision    recall  f1-score   support

                 art       0.41      0.27      0.32        41
       entertainment       0.53      0.49      0.51        74
             fashion       0.73      0.68      0.70        66
                food       0.79      0.90      0.84        99
              gaming       0.97      1.00      0.99        36
health and lifestyle       0.66      0.69      0.67       103
    mom and children       0.91      0.79      0.85        39
              sports       0.97      0.81      0.88        36
                tech       0.67      0.78      0.72        77
              travel       0.66      0.70      0.68        44

            accuracy                           0.71       615
           macro avg       0.73      0.71      0.72       615
        weighted avg       0.71      0.71      0.71       615



# Best Model: Logistic Regression with OvR and SMOTE

This section focuses on:
- Training and optimizing the best-performing model: **Logistic Regression with One-vs-Rest (OvR)**.
- Balancing the data with SMOTE using an optimized threshold of 600.

The final model is in a dedicated directory, called 'model' for reuse.


In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import classification_report, accuracy_score
from imblearn.over_sampling import SMOTE

# Load the TF-IDF Vectorized Data
file_path = 'tfidf_vectorized.csv'
df = pd.read_csv(file_path)

# Separate features and target
X = df.drop('category', axis=1)
y = df['category']

threshold = 600

# Identify categories with less than threhold samples
category_counts = y.value_counts()
categories_to_resample = category_counts[category_counts < threshold].index

# Apply SMOTE only to underrepresented categories
smote = SMOTE(sampling_strategy={category: threshold for category in categories_to_resample}, random_state=21)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Split the resampled data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=21)

# Comment/uncomment the sections below based on your requirement

# 1. Perform Grid Search
# Uncomment this section to find the best parameters using Grid Search
# param_grid = {
#     'estimator__C': [0.1, 1, 10, 100],         # Regularization strength
#     'estimator__penalty': ['l2'],              # Regularization type
#     'estimator__solver': ['lbfgs', 'saga']     # Solvers for optimization
# }
# lr = OneVsRestClassifier(LogisticRegression(random_state=42, max_iter=1000))
# grid_search = GridSearchCV(estimator=lr, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2)
# grid_search.fit(X_train, y_train)
# print("Best Hyperparameters:", grid_search.best_params_)
# best_lr = grid_search.best_estimator_

# 2. Use Predefined Best Parameters
# Uncomment this section if you want to directly use predefined parameters
best_params = {'C': 10, 'penalty': 'l2', 'solver': 'saga'}
base_lr = LogisticRegression(random_state=21, max_iter=1000, **best_params)
best_lr = OneVsRestClassifier(base_lr)
best_lr.fit(X_train, y_train)

# Predict on the test set
y_pred = best_lr.predict(X_test)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report Logistic Regression with OneVsRest and SMOTE:\n", classification_report(y_test, y_pred))


Accuracy: 0.8975

Classification Report Logistic Regression with OneVsRest and SMOTE:
                       precision    recall  f1-score   support

                 art       0.88      0.93      0.90       106
       entertainment       0.86      0.82      0.84       130
             fashion       0.88      0.90      0.89       135
                food       0.89      0.88      0.89       120
              gaming       0.98      1.00      0.99       124
health and lifestyle       0.77      0.67      0.72       118
    mom and children       0.97      0.98      0.97       121
              sports       0.99      0.99      0.99       101
                tech       0.88      0.93      0.90       124
              travel       0.88      0.88      0.88       121

            accuracy                           0.90      1200
           macro avg       0.90      0.90      0.90      1200
        weighted avg       0.90      0.90      0.90      1200

