In [8]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report
from sklearn.svm import SVC
from google.colab import files

uploaded = files.upload()

# Load the mobile data
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

# Concatenate train and test data
data = pd.concat([train, test], axis=0)

# Separate features and target variable
X = data.drop(columns=['Label'])
y = data['Label']

# Create a single train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=123)

# Scale the data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Initialize variables to track the best model and its accuracy
best_accuracy = 0
best_params = {'degree': None, 'C': None}

# Iterate over degrees 1 through 4 and C values from 60 to 80 with increments of 0.5
degree_range = range(1, 5)
C_range = [c / 2 for c in range(120, 161)]  # Generates values 60, 60.5, ..., 80

for degree in degree_range:
    for C in C_range:
        # Initialize the SVC with polynomial kernel
        model = SVC(kernel='poly', degree=degree, C=C, random_state=123)

        # Train the model on the scaled training data
        model.fit(X_train_scaled, y_train)

        # Predict on the test set and calculate accuracy
        y_pred = model.predict(X_test_scaled)
        accuracy = accuracy_score(y_test, y_pred)

        print(f"Degree={degree}, C={C:.1f}, Test Accuracy: {accuracy:.4f}")

        # Update the best parameters if current model is better
        if accuracy > best_accuracy:
            best_accuracy = accuracy
            best_params['degree'] = degree
            best_params['C'] = C

# Output the best results
print("\nBest parameters found:")
print(f"Degree={best_params['degree']}, C={best_params['C']}, Test Accuracy: {best_accuracy:.4f}")

Saving test.csv to test (1).csv
Saving train.csv to train (1).csv
Degree=1, C=60.0, Test Accuracy: 0.9333
Degree=1, C=60.5, Test Accuracy: 0.9333
Degree=1, C=61.0, Test Accuracy: 0.9333
Degree=1, C=61.5, Test Accuracy: 0.9333
Degree=1, C=62.0, Test Accuracy: 0.9333
Degree=1, C=62.5, Test Accuracy: 0.9333
Degree=1, C=63.0, Test Accuracy: 0.9333
Degree=1, C=63.5, Test Accuracy: 0.9333
Degree=1, C=64.0, Test Accuracy: 0.9333
Degree=1, C=64.5, Test Accuracy: 0.9333
Degree=1, C=65.0, Test Accuracy: 0.9333
Degree=1, C=65.5, Test Accuracy: 0.9333
Degree=1, C=66.0, Test Accuracy: 0.9333
Degree=1, C=66.5, Test Accuracy: 0.9333
Degree=1, C=67.0, Test Accuracy: 0.9333
Degree=1, C=67.5, Test Accuracy: 0.9333
Degree=1, C=68.0, Test Accuracy: 0.9333
Degree=1, C=68.5, Test Accuracy: 0.9333
Degree=1, C=69.0, Test Accuracy: 0.9333
Degree=1, C=69.5, Test Accuracy: 0.9333
Degree=1, C=70.0, Test Accuracy: 0.9333
Degree=1, C=70.5, Test Accuracy: 0.9333
Degree=1, C=71.0, Test Accuracy: 0.9333
Degree=1, C=71

In [13]:
best_accuracy = 0
best_C = None
best_degree = None

# Iterate over degrees 1 through 4 and C values from 60 to 80 with increments of 0.5
degree_range = range(1, 5)
C_range = np.arange(60, 92.5, 0.5)  # Generates values 60, 60.5, ..., 80

for degree in degree_range:
    for C in C_range:
        # Initialize the SVC with polynomial kernel
        model = SVC(kernel='poly', degree=degree, C=C, random_state=123)

        # Train the model on the scaled training data
        model.fit(X_train_scaled, y_train)

        # Predict on the test set and calculate accuracy
        y_pred = model.predict(X_test_scaled)
        accuracy = accuracy_score(y_test, y_pred)

        # Update the best parameters if the current model is better
        if accuracy > best_accuracy:
            best_accuracy = accuracy
            best_C = C
            best_degree = degree

# Output the best results
print("\nBest parameters found:")
print(f"Degree={best_degree}, C={best_C}, Test Accuracy: {best_accuracy:.4f}")


Best parameters found:
Degree=1, C=60.0, Test Accuracy: 0.9333


In [15]:
# Initialize variables to track the best model and its accuracy
best_accuracy = 0
best_params = {'gamma': None, 'C': None}

# Define ranges for gamma and C
gamma_range = np.arange(0.0005, 0.0101, 0.0001)
C_range = np.arange(1, 75.5, 0.5)

# Iterate over all combinations of gamma and C
for gamma in gamma_range:
    for C in C_range:
        # Initialize the SVC with Gaussian (RBF) kernel
        model = SVC(kernel='rbf', gamma=gamma, C=C, random_state=123)

        # Train the model on the scaled training data
        model.fit(X_train_scaled, y_train)

        # Predict on the test set and calculate accuracy
        y_pred = model.predict(X_test_scaled)
        accuracy = accuracy_score(y_test, y_pred)
        print(f"Gamma={gamma:.4f}, C={C:.1f}, Test Accuracy: {accuracy:.4f}")

        # Update best parameters if current model is better
        if accuracy > best_accuracy:
            best_accuracy = accuracy
            best_params['gamma'] = gamma
            best_params['C'] = C

# Output the best results
print(f"Best gamma: {best_params['gamma']}")
print(f"Best C: {best_params['C']}")

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Gamma=0.0067, C=35.0, Test Accuracy: 0.9667
Gamma=0.0067, C=35.5, Test Accuracy: 0.9667
Gamma=0.0067, C=36.0, Test Accuracy: 0.9667
Gamma=0.0067, C=36.5, Test Accuracy: 0.9667
Gamma=0.0067, C=37.0, Test Accuracy: 0.9667
Gamma=0.0067, C=37.5, Test Accuracy: 0.9667
Gamma=0.0067, C=38.0, Test Accuracy: 0.9667
Gamma=0.0067, C=38.5, Test Accuracy: 0.9667
Gamma=0.0067, C=39.0, Test Accuracy: 0.9667
Gamma=0.0067, C=39.5, Test Accuracy: 0.9667
Gamma=0.0067, C=40.0, Test Accuracy: 0.9667
Gamma=0.0067, C=40.5, Test Accuracy: 0.9667
Gamma=0.0067, C=41.0, Test Accuracy: 0.9667
Gamma=0.0067, C=41.5, Test Accuracy: 0.9667
Gamma=0.0067, C=42.0, Test Accuracy: 0.9667
Gamma=0.0067, C=42.5, Test Accuracy: 0.9667
Gamma=0.0067, C=43.0, Test Accuracy: 0.9667
Gamma=0.0067, C=43.5, Test Accuracy: 0.9667
Gamma=0.0067, C=44.0, Test Accuracy: 0.9667
Gamma=0.0067, C=44.5, Test Accuracy: 0.9667
Gamma=0.0067, C=45.0, Test Accuracy: 0.9667
Gamma=0.006

In [16]:
import numpy as np
from collections import Counter

# Count the number of predictions for each class
predicted_counts = Counter(y_pred)

print("Number of instances predicted for each class:")
for label, count in predicted_counts.items():
    print(f"Class {label}: {count}")

Number of instances predicted for each class:
Class Wikipedia: 8
Class ChatGPT: 5
Class Reddit: 6
Class LinkedIn: 11


In [17]:
# GridSearch for SVM
from sklearn.model_selection import GridSearchCV

# Define the parameter grid
param_grid = {
    'kernel': ['poly'],
    'degree': [1, 2, 3, 4],
    'C': [c / 2 for c in range(120, 161)],  # 60 to 80 by 0.5
}

# Create the SVM model
model = SVC(random_state=123)

# Set up the GridSearch
grid = GridSearchCV(model, param_grid, cv=5, verbose=2, n_jobs=-1)

# Fit the grid search to training data
grid.fit(X_train_scaled, y_train)

# After fitting, best parameters are:
print(f"Best parameters: {grid.best_params_}")
print(f"Best CV score: {grid.best_score_:.4f}")

# Evaluate on test set
y_pred = grid.predict(X_test_scaled)
print("\nTest set classification report:")
print(classification_report(y_test, y_pred))

Fitting 5 folds for each of 164 candidates, totalling 820 fits
Best parameters: {'C': 60.0, 'degree': 1, 'kernel': 'poly'}
Best CV score: 0.8000

Test set classification report:
              precision    recall  f1-score   support

     ChatGPT       1.00      1.00      1.00         5
    LinkedIn       0.83      1.00      0.91        10
      Reddit       1.00      0.71      0.83         7
   Wikipedia       1.00      1.00      1.00         8

    accuracy                           0.93        30
   macro avg       0.96      0.93      0.94        30
weighted avg       0.94      0.93      0.93        30

