In [1]:
# imoprting necessary librarie
import os
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.applications.resnet50 import preprocess_input
from tensorflow.keras.utils import load_img, img_to_array
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, log_loss
import joblib

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
zipped_files_train = ['101_150.zip',
 '1_50.zip',
  '151_200.zip',
 '201_250.zip',
 '251_300.zip',
 '51_100.zip',
 ]
zipped_files_test =  ['301_350.zip',
 '351_400.zip',
 '401_450.zip',
 '451_475.zip',
 ]


In [5]:
for zip_file in zipped_files_train:
  !unzip "/content/drive/MyDrive/{zip_file}" -d "/content/Images/train"
for zip_file in zipped_files_test:
  !unzip "/content/drive/MyDrive/{zip_file}" -d "/content/Images/test"

Archive:  /content/drive/MyDrive/101_150.zip
  inflating: /content/Images/train/0101_1.jpg  
  inflating: /content/Images/train/0101_2.jpg  
  inflating: /content/Images/train/0101_3.jpg  
  inflating: /content/Images/train/0101_4.jpg  
  inflating: /content/Images/train/0102_1.jpg  
  inflating: /content/Images/train/0102_2.jpg  
  inflating: /content/Images/train/0102_3.jpg  
  inflating: /content/Images/train/0102_4.jpg  
  inflating: /content/Images/train/0103_1.jpg  
  inflating: /content/Images/train/0103_2.jpg  
  inflating: /content/Images/train/0103_3.jpg  
  inflating: /content/Images/train/0103_4.jpg  
  inflating: /content/Images/train/0104_1.jpg  
  inflating: /content/Images/train/0104_2.jpg  
  inflating: /content/Images/train/0104_3.jpg  
  inflating: /content/Images/train/0104_4.jpg  
  inflating: /content/Images/train/0105_1.jpg  
  inflating: /content/Images/train/0105_2.jpg  
  inflating: /content/Images/train/0105_3.jpg  
  inflating: /content/Images/train/0105_4.j

In [6]:
import shutil

images = os.listdir('/content/Images/train')
for image in images:
    image_name = int(image.split("_")[0])
    if image_name > 282:
        shutil.move(f"/content/Images/train/{image}",f"/content/Images/test/{image}")

In [7]:
train_images = os.listdir('/content/Images/train')
test_images = os.listdir('/content/Images/test')

In [8]:
len(train_images)

1128

In [9]:
len(test_images)

772

In [10]:
import cv2  # OpenCV for image processing
import numpy as np  # NumPy for numerical operations
from skimage.feature import graycomatrix, graycoprops, hog  # For GLCM and HOG features

In [11]:
# Preprocessing function
def preprocess_image(image):
    """
    Preprocess the input image for feature extraction.
    Includes adaptive thresholding, edge detection, and histogram equalization.
    """
    # Step 1: Adaptive thresholding for binarization
    adaptive_binary = cv2.adaptiveThreshold(
        image, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY_INV, 11, 2
    )

    # Step 2: Edge detection using Canny
    edges = cv2.Canny(adaptive_binary, 100, 200)

    # Step 3: Histogram equalization for contrast enhancement
    equalized_image = cv2.equalizeHist(image)

    return adaptive_binary, edges, equalized_image

# Zoning, GLCM, and HOG feature extraction
def extract_advanced_features(binary_image):
    """
    Extracts advanced features including zoning, GLCM, and HOG from the binary image.
    """
    # Resize for consistent feature extraction
    resized_image = cv2.resize(binary_image, (256, 256), interpolation=cv2.INTER_AREA)

    features = {}

    # 1. Zoning Features (Divide into 4x4 grid and calculate density in each zone)
    zones = 4
    zone_height, zone_width = resized_image.shape[0] // zones, resized_image.shape[1] // zones
    zone_features = []
    for i in range(zones):
        for j in range(zones):
            zone = resized_image[i * zone_height:(i + 1) * zone_height, j * zone_width:(j + 1) * zone_width]
            zone_density = np.sum(zone == 255) / (zone_height * zone_width)
            zone_features.append(zone_density)
    features.update({f'zone_{i}': val for i, val in enumerate(zone_features)})

    # 2. GLCM Texture Features
    glcm = graycomatrix(resized_image, [1], [0], 256, symmetric=True, normed=True)
    features['glcm_contrast'] = graycoprops(glcm, 'contrast')[0, 0]
    features['glcm_correlation'] = graycoprops(glcm, 'correlation')[0, 0]
    features['glcm_homogeneity'] = graycoprops(glcm, 'homogeneity')[0, 0]
    features['glcm_energy'] = graycoprops(glcm, 'energy')[0, 0]

    return features

# HOG feature extraction with high resolution
def extract_hog_features(binary_image, target_resolution=(512, 512)):
    """
    Extracts HOG features from a high-resolution resized binary image.
    """
    high_res_image = cv2.resize(binary_image, target_resolution, interpolation=cv2.INTER_AREA)
    hog_features, _ = hog(high_res_image, orientations=8, pixels_per_cell=(16, 16),
                          cells_per_block=(2, 2), visualize=True, feature_vector=True)
    return {f'hog_{i}': val for i, val in enumerate(hog_features[:20])}  # Include first 20 HOG features

# Complete feature extraction pipeline
def extract_features(image_path):
    """
    Complete feature extraction pipeline for a given image path.
    Combines preprocessing, zoning, GLCM, and HOG feature extraction.
    """
    # Load the image
    image = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)

    # Preprocess the image
    adaptive_binary, edges, equalized_image = preprocess_image(image)

    # Extract features
    features = extract_advanced_features(adaptive_binary)
    hog_features = extract_hog_features(adaptive_binary)

    # Combine all features
    features.update(hog_features)

    return features



In [12]:
train_outputs = pd.read_csv('/content/drive/MyDrive/train_answers.csv')

In [13]:
def get_male_values(writer,train=True):
  """
  Extracts male values from train_outputs based on the provided writer.

  Args:
    writer: The writer ID to filter the DataFrame.

  Returns:
    A list of male values for the specified writer.
    Returns an empty list if no matching writer is found or if an error occurs.
  """
  try:

    # Filter the DataFrame based on the writer ID
    if train:
      train_outputs = pd.read_csv('/content/drive/MyDrive/train_answers.csv')
    else:
      train_outputs = pd.read_csv('/content/drive/MyDrive/test_answers.csv')

    writer_data = train_outputs[train_outputs['writer'] == writer]

    # Extract male values
    male_values = writer_data['male'].tolist()

    return male_values
  except (FileNotFoundError, KeyError) as e:
    print(f"Error: {e}")  # Print the error for debugging purposes
    return []
  except Exception as e:
    print(f"An unexpected error occurred: {e}")
    return []

In [14]:
id = (train_images[279].split("_")[0])
id

'0186'

In [15]:
get_male_values(id)

[]

In [None]:
features = []
outputs = []
train_images = sorted(train_images)
train_dir = "/content/Images/train"
for image in train_images:
  id = int(image.split("_")[0])
  image_path = os.path.join(train_dir, image)
  print(f"{image} under processing")
  feature = extract_features(image_path)
  features.append(feature)
  outputs.append(get_male_values(id))
# Display the extracted features
print("Extracted Features:")
print(features)

In [None]:
np.save('/content/features.npy', features)
np.save('/content/labels.npy', outputs)

In [None]:
len(features)

In [None]:
len(outputs)

In [None]:
outputs

In [None]:
from pickle import TRUE
features = np.load('features.npy',allow_pickle=True)
features[0]

{'zone_0': 0.0,
 'zone_1': 0.0,
 'zone_2': 0.0,
 'zone_3': 0.0,
 'zone_4': 0.0,
 'zone_5': 0.0,
 'zone_6': 0.0,
 'zone_7': 0.0,
 'zone_8': 0.0,
 'zone_9': 0.0,
 'zone_10': 0.0,
 'zone_11': 0.0,
 'zone_12': 0.0,
 'zone_13': 0.0,
 'zone_14': 0.0,
 'zone_15': 0.0,
 'glcm_contrast': 249.71308210784315,
 'glcm_correlation': 0.6039641745646916,
 'glcm_homogeneity': 0.7950779706531294,
 'glcm_energy': 0.7537487718980417,
 'hog_0': 0.0,
 'hog_1': 0.0,
 'hog_2': 0.0,
 'hog_3': 0.0,
 'hog_4': 0.0,
 'hog_5': 0.0,
 'hog_6': 0.0,
 'hog_7': 0.0,
 'hog_8': 0.0,
 'hog_9': 0.0,
 'hog_10': 0.0,
 'hog_11': 0.0,
 'hog_12': 0.0,
 'hog_13': 0.0,
 'hog_14': 0.0,
 'hog_15': 0.0,
 'hog_16': 0.0,
 'hog_17': 0.0,
 'hog_18': 0.0,
 'hog_19': 0.0}

In [None]:
# Flatten each dictionary to a numeric list
features_array = np.array([list(f.values()) for f in features])

# Convert to DataFrame for easier handling
features_df = pd.DataFrame(features_array)

# Handle missing values (if any) - Impute with mean
features_df.fillna(features_df.mean(), inplace=True)

features_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,30,31,32,33,34,35,36,37,38,39
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.507097,0.151915,0.151915,0.132118
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1123,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000
1124,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000
1125,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000
1126,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Load the extracted features
features = np.load('features.npy', allow_pickle=True)
features_df = pd.DataFrame(features.tolist())
features = np.array([list(f.values()) for f in features])  # Assuming dictionaries have numeric values

# Load the target variable
train_outputs = pd.read_csv('/content/drive/MyDrive/train_answers.csv')
target = train_outputs['male']

# Handle missing values (if any) - Impute with mean
features_df.fillna(features_df.mean(), inplace=True)

# Initialize and train the Logistic Regression model
model = LogisticRegression(max_iter=1000)  # Increased max_iter for convergence
model.fit(features, outputs)


  y = column_or_1d(y, warn=True)


In [None]:
get_male_values(283,train=False)

[1]

In [None]:
test_features = []
test_outputs = []
test_images = sorted(test_images)
train_dir = "/content/Images/test"
for image in test_images:
  id = int(image.split("_")[0])
  image_path = os.path.join(train_dir, image)
  print(f"{image} under processing")
  feature = extract_features(image_path)
  test_features.append(feature)
  test_outputs.append(get_male_values(id,train=False))

0283_1.jpg under processing
0283_2.jpg under processing
0283_3.jpg under processing
0283_4.jpg under processing
0284_1.jpg under processing
0284_2.jpg under processing
0284_3.jpg under processing
0284_4.jpg under processing
0285_1.jpg under processing
0285_2.jpg under processing
0285_3.jpg under processing
0285_4.jpg under processing
0286_1.jpg under processing
0286_2.jpg under processing
0286_3.jpg under processing
0286_4.jpg under processing
0287_1.jpg under processing
0287_2.jpg under processing
0287_3.jpg under processing
0287_4.jpg under processing
0288_1.jpg under processing
0288_2.jpg under processing
0288_3.jpg under processing
0288_4.jpg under processing
0289_1.jpg under processing
0289_2.jpg under processing
0289_3.jpg under processing
0289_4.jpg under processing
0290_1.jpg under processing
0290_2.jpg under processing
0290_3.jpg under processing
0290_4.jpg under processing
0291_1.jpg under processing
0291_2.jpg under processing
0291_3.jpg under processing
0291_4.jpg under pro

In [None]:
test_features

[{'zone_0': 0.0,
  'zone_1': 0.0,
  'zone_2': 0.0,
  'zone_3': 0.0,
  'zone_4': 0.0,
  'zone_5': 0.0,
  'zone_6': 0.0,
  'zone_7': 0.0,
  'zone_8': 0.0,
  'zone_9': 0.0,
  'zone_10': 0.0,
  'zone_11': 0.0,
  'zone_12': 0.0,
  'zone_13': 0.0,
  'zone_14': 0.0,
  'zone_15': 0.0,
  'glcm_contrast': 291.3957873774509,
  'glcm_correlation': 0.45619143936450157,
  'glcm_homogeneity': 0.8934455258804828,
  'glcm_energy': 0.8824117552326871,
  'hog_0': 0.0,
  'hog_1': 0.0,
  'hog_2': 0.0,
  'hog_3': 0.0,
  'hog_4': 0.0,
  'hog_5': 0.0,
  'hog_6': 0.0,
  'hog_7': 0.0,
  'hog_8': 0.0,
  'hog_9': 0.0,
  'hog_10': 0.0,
  'hog_11': 0.0,
  'hog_12': 0.0,
  'hog_13': 0.0,
  'hog_14': 0.0,
  'hog_15': 0.0,
  'hog_16': 0.0,
  'hog_17': 0.0,
  'hog_18': 0.0,
  'hog_19': 0.0},
 {'zone_0': 0.0,
  'zone_1': 0.0,
  'zone_2': 0.0,
  'zone_3': 0.0,
  'zone_4': 0.0,
  'zone_5': 0.0,
  'zone_6': 0.0,
  'zone_7': 0.0,
  'zone_8': 0.0,
  'zone_9': 0.0,
  'zone_10': 0.0,
  'zone_11': 0.0,
  'zone_12': 0.0,
  'zone

In [None]:
# Flatten each dictionary to a numeric list
features_array = np.array([list(f.values()) for f in test_features])

# Convert to DataFrame for easier handling
test_features_df = pd.DataFrame(features_array)

# Handle missing values (if any) - Impute with mean
test_features_df.fillna(features_df.mean(), inplace=True)

(test_features_df)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,30,31,32,33,34,35,36,37,38,39
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
767,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
768,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
769,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
770,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


0      0.0
1      0.0
2      0.0
3      0.0
4      0.0
      ... 
767    0.0
768    0.0
769    0.0
770    0.0
771    0.0
Name: 0, Length: 772, dtype: float64


In [None]:
# Make predictions on the test set
y_pred = model.predict(test_features_df)
# Evaluate the model
accuracy = accuracy_score(test_outputs, y_pred)
print(f"Accuracy: {accuracy}")
print(classification_report(test_outputs, y_pred))
print(confusion_matrix(test_outputs, y_pred))

Accuracy: 0.5751295336787565
              precision    recall  f1-score   support

           0       0.64      0.61      0.62       444
           1       0.50      0.53      0.51       328

    accuracy                           0.58       772
   macro avg       0.57      0.57      0.57       772
weighted avg       0.58      0.58      0.58       772

[[270 174]
 [154 174]]


In [None]:
from sklearn.preprocessing import MinMaxScaler

# Assuming features_df is your DataFrame
scaler = MinMaxScaler()
normalized_features = scaler.fit_transform(features_df)

# Convert back to DataFrame if needed
normalized_features_df = pd.DataFrame(normalized_features, columns=features_df.columns)

normalized_features_df

Unnamed: 0,zone_0,zone_1,zone_2,zone_3,zone_4,zone_5,zone_6,zone_7,zone_8,zone_9,...,hog_10,hog_11,hog_12,hog_13,hog_14,hog_15,hog_16,hog_17,hog_18,hog_19
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.717144,0.504336,0.397113,0.604116
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1123,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000
1124,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000
1125,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000
1126,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000


In [None]:
from sklearn.svm import SVC

# Assuming 'features' and 'outputs' are already defined and loaded as in your previous code
# ... (Your existing code for feature extraction and data loading) ...

# Initialize and train the SVM model
svm_model = SVC(kernel='linear', C=1)  # You can adjust the kernel and C parameter
svm_model.fit(normalized_features_df, outputs)

# ... (Your existing code for testing data preparation) ...




  y = column_or_1d(y, warn=True)


In [None]:
# Make predictions using the SVM model

svm_y_pred = svm_model.predict(test_features_df)
# Evaluate the SVM model
svm_accuracy = accuracy_score(test_outputs, svm_y_pred)
print(f"SVM Accuracy: {svm_accuracy}")
print(classification_report(test_outputs, svm_y_pred))
print(confusion_matrix(test_outputs, svm_y_pred))

SVM Accuracy: 0.5751295336787565
              precision    recall  f1-score   support

           0       0.58      1.00      0.73       444
           1       0.00      0.00      0.00       328

    accuracy                           0.58       772
   macro avg       0.29      0.50      0.37       772
weighted avg       0.33      0.58      0.42       772

[[444   0]
 [328   0]]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
# prompt: train a random forest model

from sklearn.ensemble import RandomForestClassifier

# Assuming 'features' and 'outputs' are already defined and loaded as in your previous code
# ... (Your existing code for feature extraction and data loading) ...

# Initialize and train the Random Forest model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)  # You can adjust the number of estimators
rf_model.fit(features, outputs)

# ... (Your existing code for testing data preparation) ...

# Make predictions using the Random Forest model
rf_y_pred = rf_model.predict(test_features_df)

# Evaluate the Random Forest model
rf_accuracy = accuracy_score(test_outputs, rf_y_pred)
print(f"Random Forest Accuracy: {rf_accuracy}")
print(classification_report(test_outputs, rf_y_pred))
print(confusion_matrix(test_outputs, rf_y_pred))

  return fit_method(estimator, *args, **kwargs)


Random Forest Accuracy: 0.5401554404145078
              precision    recall  f1-score   support

           0       0.61      0.55      0.58       444
           1       0.46      0.53      0.50       328

    accuracy                           0.54       772
   macro avg       0.54      0.54      0.54       772
weighted avg       0.55      0.54      0.54       772

[[242 202]
 [153 175]]


#SECOND APPROACH



In [20]:
images = os.listdir('/content/Images/train')
img_size = (600, 600)

In [19]:
# loading the resnet model
base_model = ResNet50(weights='imagenet', include_top=False, pooling='avg')
# freezing the model
base_model.trainabale = False

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/resnet/resnet50_weights_tf_dim_ordering_tf_kernels_notop.h5
[1m94765736/94765736[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step


In [21]:
base_model.summary()

In [34]:
# intializing lists to store features, filenames and labels
features = []
filenames = []
outputs = []

In [23]:
# loading all the labels
train_labels = pd.read_csv('/content/drive/MyDrive/train_answers.csv')
test_labels = pd.read_csv('/content/drive/MyDrive/test_answers.csv')

In [24]:
test_labels = test_labels.drop(columns='Usage', axis=0)

In [25]:
images = sorted(images)

In [26]:
len(train_labels)

282

In [27]:
images[0].split("_")[0]

'0001'

In [35]:
import cv2
images_dir = '/content/Images/train'
for image in images:
    writer_id = int(image.split("_")[0])
    print(f"{image} under processing")
    # loading the image
    image_path = os.path.join(images_dir, image)
    img = load_img(image_path, target_size=img_size)
    img_array = img_to_array(img)

    # Convert image to grayscale using OpenCV
    img_gray = cv2.cvtColor(img_array.astype(np.uint8), cv2.COLOR_BGR2RGB)

    # Resizing the image if needed
    img_gray = cv2.resize(img_gray, (img_size[0], img_size[1]))

    # Preprocessing for ResNet50
    img_array = np.expand_dims(img_gray, axis=0)
    img_array = preprocess_input(img_array)

    # Extract features
    feature = base_model.predict(img_array)
    features.append(feature[0])
    outputs.append(get_male_values(writer_id))


features = np.array(features)
outputs = np.array(outputs)

print(features)

0001_1.jpg under processing
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
0001_2.jpg under processing
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
0001_3.jpg under processing
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
0001_4.jpg under processing
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
0002_1.jpg under processing
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
0002_2.jpg under processing
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
0002_3.jpg under processing
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
0002_4.jpg under processing
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
0003_1.jpg under processing
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step
0003_2.jpg under processing
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step


In [36]:
print(len(outputs))

1128


In [None]:
features = features[:-1]

In [37]:
len(features)

1128

In [38]:
np.save('/content/plain_features.npy', features)
np.save('/content/plain_labels.npy', outputs)

In [39]:
print(len(features[0]))
print(len(features))

2048
1128


In [41]:
# performing feature selection
# correlation analysis
df = pd.DataFrame(features)
corr_matrix = df.corr().abs()

upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
to_drop = [column for column in upper.columns if any(upper[column] > 0.6)]

features_selected = df.drop(columns=to_drop).values
print('Features dropped: ', len(to_drop))

Features dropped:  1000


In [42]:
# training SVM
svm_model = SVC(kernel='linear', C=1.0, random_state=42, probability=True)
svm_model.fit(features, outputs)

  y = column_or_1d(y, warn=True)


In [48]:
# model after feature selection
svm_selected = SVC(kernel='linear', C=1.0, random_state=42, probability=True)
svm_selected.fit(features_selected, outputs)

  y = column_or_1d(y, warn=True)


In [43]:
images_dir = '/content/Images/test'
test_features = []
test_outputs = []
images = os.listdir('/content/Images/test')
images = sorted(images)
for image in images:
    writer_id = int(image.split("_")[0])
    print(f"{image} under processing")
    # loading the image
    image_path = os.path.join(images_dir, image)
    img = load_img(image_path, target_size=img_size)
    img_array = img_to_array(img)

    # Convert image to grayscale using OpenCV
    img_gray = cv2.cvtColor(img_array.astype(np.uint8), cv2.COLOR_BGR2RGB)

    # Resizing the image if needed
    img_gray = cv2.resize(img_gray, (img_size[0], img_size[1]))

    # Preprocessing for ResNet50
    img_array = np.expand_dims(img_gray, axis=0)
    img_array = preprocess_input(img_array)

    # Extract features
    feature = base_model.predict(img_array)
    test_features.append(feature[0])
    test_outputs.append(get_male_values(writer_id,train=False))


test_features = np.array(test_features)
test_outputs = np.array(test_outputs)

print(test_features)




0283_1.jpg under processing
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
0283_2.jpg under processing
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
0283_3.jpg under processing
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
0283_4.jpg under processing
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
0284_1.jpg under processing
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
0284_2.jpg under processing
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
0284_3.jpg under processing
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step
0284_4.jpg under processing
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
0285_1.jpg under processing
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
0285_2.jpg under processing
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step


In [45]:
np.save('/content/plain_features_test.npy', test_features)
np.save('/content/plain_labels_test.npy', test_outputs)

In [49]:
test_features = np.load('/content/plain_features_test.npy',allow_pickle=True)
test_features

array([[8.16547424e-02, 6.85949251e-03, 2.23157462e-03, ...,
        5.24438977e-01, 0.00000000e+00, 1.19592965e-01],
       [2.23820861e-02, 2.14735419e-02, 3.82149155e-04, ...,
        5.60707211e-01, 1.43019413e-03, 1.69452399e-01],
       [1.92058869e-02, 1.20440004e-02, 6.36669248e-03, ...,
        5.32508135e-01, 0.00000000e+00, 3.42501312e-01],
       ...,
       [3.90583556e-03, 2.50236597e-03, 1.73703842e-02, ...,
        3.32972169e-01, 0.00000000e+00, 1.06839173e-01],
       [5.48639931e-02, 3.32510769e-02, 1.64727978e-02, ...,
        5.58139861e-01, 5.85382544e-02, 1.74689189e-01],
       [8.97360872e-03, 0.00000000e+00, 1.64927042e-03, ...,
        3.90159607e-01, 1.16002643e-02, 9.87177864e-02]], dtype=float32)

In [50]:
# applying same correlation analysis on test features
df = pd.DataFrame(test_features)
test_features_selected = df.drop(columns=to_drop).values

In [51]:
# making predictions
svm_y_pred = svm_model.predict(test_features)

accuracy = accuracy_score(test_outputs, svm_y_pred)
print(f'Accuracy of SVM: {accuracy}')

print('Classification Report: ')
print(classification_report(test_outputs, svm_y_pred))

cm = confusion_matrix(test_outputs, svm_y_pred)
print('Confuson Matrix: ')
print(cm)

Accuracy of SVM: 0.6878238341968912
Classification Report: 
              precision    recall  f1-score   support

           0       0.75      0.68      0.72       444
           1       0.62      0.69      0.65       328

    accuracy                           0.69       772
   macro avg       0.68      0.69      0.68       772
weighted avg       0.69      0.69      0.69       772

Confuson Matrix: 
[[304 140]
 [101 227]]


In [52]:
svm_selected_y_pred = svm_selected.predict(test_features_selected)

accuracy = accuracy_score(test_outputs, svm_selected_y_pred)
print(f'Accuracy of SVM: {accuracy}')

print('Classification Report: ')
print(classification_report(test_outputs, svm_selected_y_pred))

cm = confusion_matrix(test_outputs, svm_selected_y_pred)
print('Confuson Matrix: ')
print(cm)

Accuracy of SVM: 0.7020725388601037
Classification Report: 
              precision    recall  f1-score   support

           0       0.76      0.70      0.73       444
           1       0.63      0.71      0.67       328

    accuracy                           0.70       772
   macro avg       0.70      0.70      0.70       772
weighted avg       0.71      0.70      0.70       772

Confuson Matrix: 
[[310 134]
 [ 96 232]]


In [53]:
joblib.dump(svm_model, '/content/svm_model.pkl')

['/content/svm_model.pkl']

In [54]:
joblib.dump(svm_selected, '/content/svm_selected_model.pkl')

['/content/svm_selected_model.pkl']

In [57]:
log_model = LogisticRegression(max_iter=5000)
log_model.fit(features, outputs)

  y = column_or_1d(y, warn=True)


In [58]:
#selected features only
log_model_selected = LogisticRegression(max_iter=5000)
log_model_selected.fit(features_selected, outputs)

  y = column_or_1d(y, warn=True)


In [59]:
# making predictions
log_y_pred = log_model.predict(test_features)

accuracy = accuracy_score(test_outputs, log_y_pred)
print(f'Accuracy of Logisitic Regression: {accuracy}')

print('Classification Report: ')
print(classification_report(test_outputs, log_y_pred))

cm = confusion_matrix(test_outputs, log_y_pred)
print('Confuson Matrix: ')
print(cm)

Accuracy of Logisitic Regression: 0.6981865284974094
Classification Report: 
              precision    recall  f1-score   support

           0       0.76      0.70      0.73       444
           1       0.63      0.70      0.66       328

    accuracy                           0.70       772
   macro avg       0.69      0.70      0.69       772
weighted avg       0.70      0.70      0.70       772

Confuson Matrix: 
[[310 134]
 [ 99 229]]


In [61]:
log_selected_y_pred = log_model_selected.predict(test_features_selected)

accuracy = accuracy_score(test_outputs, log_selected_y_pred)
print(f'Accuracy of Logisitic Regression: {accuracy}')

print('Classification Report: ')
print(classification_report(test_outputs, log_selected_y_pred))

cm = confusion_matrix(test_outputs, log_selected_y_pred)
print('Confuson Matrix: ')
print(cm)

Accuracy of Logisitic Regression: 0.694300518134715
Classification Report: 
              precision    recall  f1-score   support

           0       0.76      0.69      0.72       444
           1       0.62      0.70      0.66       328

    accuracy                           0.69       772
   macro avg       0.69      0.70      0.69       772
weighted avg       0.70      0.69      0.70       772

Confuson Matrix: 
[[305 139]
 [ 97 231]]


In [62]:
joblib.dump(log_model, '/content/log_model.pkl')

['/content/log_model.pkl']

In [63]:
joblib.dump(log_model_selected, '/content/log_model_selected.pkl')

['/content/log_model_selected.pkl']

In [64]:
# saving the base_model
joblib.dump(base_model, '/content/base_model.pkl')

['/content/base_model.pkl']

In [65]:
from sklearn.ensemble import RandomForestClassifier
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(features,outputs)

  return fit_method(estimator, *args, **kwargs)


In [66]:
# making predictions
rf_y_pred = rf_model.predict(test_features)

accuracy = accuracy_score(test_outputs, rf_y_pred)
print(f'Accuracy of Random Forest: {accuracy}')

print('Classification Report: ')
print(classification_report(test_outputs, rf_y_pred))

cm = confusion_matrix(test_outputs, rf_y_pred)
print('Confuson Matrix: ')
print(cm)

Accuracy of Random Forest: 0.6593264248704663
Classification Report: 
              precision    recall  f1-score   support

           0       0.73      0.65      0.69       444
           1       0.59      0.67      0.63       328

    accuracy                           0.66       772
   macro avg       0.66      0.66      0.66       772
weighted avg       0.67      0.66      0.66       772

Confuson Matrix: 
[[288 156]
 [107 221]]


In [67]:
rf_selected = RandomForestClassifier(n_estimators=100, random_state=42)
rf_selected.fit(features_selected, outputs)

  return fit_method(estimator, *args, **kwargs)


In [69]:
rf_selected_y_pred = rf_selected.predict(test_features_selected)

accuracy = accuracy_score(test_outputs, rf_selected_y_pred)
print(f'Accuracy of Random Forest: {accuracy}')

print('Classification Report: ')
print(classification_report(test_outputs, rf_selected_y_pred))

cm = confusion_matrix(test_outputs, rf_selected_y_pred)
print('Confuson Matrix: ')
print(cm)

Accuracy of Random Forest: 0.6735751295336787
Classification Report: 
              precision    recall  f1-score   support

           0       0.73      0.69      0.71       444
           1       0.61      0.65      0.63       328

    accuracy                           0.67       772
   macro avg       0.67      0.67      0.67       772
weighted avg       0.68      0.67      0.67       772

Confuson Matrix: 
[[308 136]
 [116 212]]


In [70]:
#saving the random forest model
joblib.dump(rf_model,'/content/rf_model.pkl')

['/content/rf_model.pkl']

In [71]:
joblib.dump(rf_selected,'/content/rf_selected.pkl')

['/content/rf_selected.pkl']

In [72]:
# calculating logloss
svm_preds_probs = svm_model.predict_proba(test_features)
log_preds_probs = log_model.predict_proba(test_features)
rf_preds_probs = rf_model.predict_proba(test_features)

In [79]:
svm_log_loss = log_loss(test_outputs, svm_preds_probs)
log_log_loss = log_loss(test_outputs, log_preds_probs)
rf_log_loss = log_loss(test_outputs, rf_preds_probs)

In [80]:
# calculating logloss
print(f'SVM Log Loss: {svm_log_loss}')
print(f'Logistic Regression Log Loss: {log_log_loss}')
print(f'Random Forest Log Loss: {rf_log_loss}')

SVM Log Loss: 0.6075907387452738
Logistic Regression Log Loss: 0.6369934739138804
Random Forest Log Loss: 0.6251529849292803


#### Log Loss after feature selection

In [77]:
svm_sel_preds_probs = svm_selected.predict_proba(test_features_selected)
log_sel_preds_probs = log_model_selected.predict_proba(test_features_selected)
rf_sel_preds_probs = rf_selected.predict_proba(test_features_selected)

In [82]:
svm_sel_log_loss = log_loss(test_outputs, svm_sel_preds_probs)
log_sel_log_loss = log_loss(test_outputs, log_sel_preds_probs)
rf_sel_log_loss = log_loss(test_outputs, rf_sel_preds_probs)

In [83]:
print(f'SVM Log Loss: {svm_sel_log_loss}')
print(f'Logistic Regression Log Loss: {log_sel_log_loss}')
print(f'Random Forest Log Loss: {rf_sel_log_loss}')

SVM Log Loss: 0.5969000259539035
Logistic Regression Log Loss: 0.5975528637224192
Random Forest Log Loss: 0.6275682004460371


### Ensemble techniques: Stacking, Blending

In [86]:
# applying stacking ensemble technique
svm_preds_train = svm_model.predict_proba(features)[:, 1]
log_preds_train = log_model.predict_proba(features)[:, 1]
rf_preds_train = rf_model.predict_proba(features)[:, 1]

stacked_features_train = np.column_stack((svm_preds_train, log_preds_train, rf_preds_train))

In [87]:
# traininh logistic regression as meta model to make predictions
meta_model = LogisticRegression()
meta_model.fit(stacked_features_train, outputs)

  y = column_or_1d(y, warn=True)


In [90]:
svm_preds_test = svm_model.predict_proba(test_features)[:, 1]
log_preds_test = log_model.predict_proba(test_features)[:, 1]
rf_preds_test = rf_model.predict_proba(test_features)[:, 1]

stacked_features_test = np.column_stack((svm_preds_test, log_preds_test, rf_preds_test))

final_predictions = meta_model.predict(stacked_features_test)
final_preds_probs = meta_model.predict_proba(stacked_features_test)

In [91]:
meta_model_log_loss = log_loss(test_outputs, final_preds_probs)
print(f'Meta Model Log Loss: {meta_model_log_loss}')

Meta Model Log Loss: 0.7082354530864996


In [92]:
# evaluating the stacked model
print("Accuracy: ", accuracy_score(test_outputs, final_predictions))
print("Classification Report: \n", classification_report(test_outputs, final_predictions))
print("Confusion Matrix: \n", confusion_matrix(test_outputs, final_predictions))

Accuracy:  0.6968911917098446
Classification Report: 
               precision    recall  f1-score   support

           0       0.77      0.68      0.72       444
           1       0.62      0.72      0.67       328

    accuracy                           0.70       772
   macro avg       0.70      0.70      0.69       772
weighted avg       0.71      0.70      0.70       772

Confusion Matrix: 
 [[303 141]
 [ 93 235]]


In [93]:
# applying the blending ensemble technique
X_train_main, X_blend, y_train_main, y_blend = train_test_split(features, outputs, test_size=0.2, random_state=42)


In [94]:
svm_model.fit(X_train_main, y_train_main)
log_model.fit(X_train_main, y_train_main)
rf_model.fit(X_train_main, y_train_main)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  return fit_method(estimator, *args, **kwargs)


In [95]:
svm_preds_blend = svm_model.predict_proba(X_blend)[:, 1]
log_preds_blend = log_model.predict_proba(X_blend)[:, 1]
rf_preds_blend = rf_model.predict_proba(X_blend)[:, 1]

stacked_features_blend = np.column_stack((svm_preds_blend, log_preds_blend, rf_preds_blend))

In [96]:
# training the meta model
meta_model = LogisticRegression()
meta_model.fit(stacked_features_blend, y_blend)

  y = column_or_1d(y, warn=True)


In [97]:
svm_preds_test = svm_model.predict_proba(test_features)[:, 1]
rf_preds_test = rf_model.predict_proba(test_features)[:, 1]
lr_preds_test = log_model.predict_proba(test_features)[:, 1]

# Stack the predictions as features for the meta-model
stacked_features_test = np.column_stack((svm_preds_test, rf_preds_test, lr_preds_test))

# Make the final prediction using the meta-model
final_predictions = meta_model.predict(stacked_features_test)
final_preds_probs = meta_model.predict_proba(stacked_features_test)

In [98]:
meta_model_log_loss = log_loss(test_outputs, final_preds_probs)
print(f'Meta Model Log Loss: {meta_model_log_loss}')

Meta Model Log Loss: 0.5943277355086987


In [99]:
# Evaluate the final predictions
print("Accuracy:", accuracy_score(test_outputs, final_predictions))
print("Classification Report:\n", classification_report(test_outputs, final_predictions))
print("Confusion Matrix:\n", confusion_matrix(test_outputs, final_predictions))


Accuracy: 0.667098445595855
Classification Report:
               precision    recall  f1-score   support

           0       0.75      0.64      0.69       444
           1       0.59      0.71      0.64       328

    accuracy                           0.67       772
   macro avg       0.67      0.67      0.67       772
weighted avg       0.68      0.67      0.67       772

Confusion Matrix:
 [[283 161]
 [ 96 232]]


In [104]:
# trying random forest as meta model
meta_model = RandomForestClassifier(n_estimators=100, random_state=42)
meta_model.fit(stacked_features_blend, y_blend)

  return fit_method(estimator, *args, **kwargs)


In [105]:
final_predictions = meta_model.predict(stacked_features_test)
final_preds_probs = meta_model.predict_proba(stacked_features_test)

In [106]:
meta_model_log_loss = log_loss(test_outputs, final_preds_probs)
print(f'Meta Model Log Loss: {meta_model_log_loss}')

Meta Model Log Loss: 0.6739611363835442


## Clustering of Features

In [107]:
from sklearn.cluster import KMeans
import numpy as np

# Load your features (assuming they are stored in a NumPy array)
features = np.load('/content/plain_features.npy', allow_pickle=True)
outputs = np.load('/content/plain_labels.npy', allow_pickle=True)


In [108]:
type(features)

numpy.ndarray

In [109]:
print(len(features))
print(len(features[0]))

1128
2048


In [110]:
features_df = pd.DataFrame(features)
kmeans = KMeans(n_clusters=2, random_state=0) # Example with 2 clusters

# Fit the KMeans model to your features and get cluster labels
cluster_labels = kmeans.fit_predict(features)

# Now, add the cluster labels as a new feature to your feature dataframe
features_df['cluster_label'] = cluster_labels

features_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2039,2040,2041,2042,2043,2044,2045,2046,2047,cluster_label
0,0.055867,0.004403,0.000000,0.0,0.000415,0.171867,0.121604,0.276430,0.004079,0.000000,...,0.0,0.010255,0.000000,0.000000,0.000000,0.000000,0.637357,0.000000,0.091797,1
1,0.006359,0.000310,0.001382,0.0,0.007300,0.032589,0.099136,0.143320,0.014443,0.008430,...,0.0,0.021771,0.000000,0.011099,0.000000,0.000000,0.690614,0.000000,0.170153,0
2,0.028010,0.015728,0.000448,0.0,0.008772,0.021521,0.060416,0.387198,0.000000,0.000769,...,0.0,0.002817,0.002135,0.011431,0.000000,0.001391,0.511592,0.009306,0.123999,1
3,0.000000,0.005009,0.001263,0.0,0.000622,0.019464,0.026923,0.558397,0.000000,0.000000,...,0.0,0.000000,0.000000,0.023221,0.000000,0.000000,0.217898,0.000000,0.077009,0
4,0.115535,0.003434,0.007155,0.0,0.005720,0.068678,0.090083,0.201725,0.000000,0.062678,...,0.0,0.015577,0.000000,0.000000,0.000000,0.000596,1.211246,0.000000,0.056072,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1123,0.040205,0.007915,0.007356,0.0,0.007344,0.001901,0.030719,0.578499,0.000000,0.000000,...,0.0,0.000000,0.000000,0.033619,0.000000,0.000000,0.135101,0.012461,0.299129,0
1124,0.035803,0.012886,0.001337,0.0,0.042935,0.055422,0.068202,0.153038,0.000000,0.095449,...,0.0,0.020240,0.004312,0.002943,0.000000,0.000000,0.511693,0.000000,0.099855,1
1125,0.027943,0.020778,0.006132,0.0,0.001427,0.050048,0.020232,0.227269,0.000000,0.001566,...,0.0,0.015078,0.006867,0.011445,0.000000,0.000000,0.518708,0.000000,0.107807,0
1126,0.014029,0.007981,0.007266,0.0,0.000634,0.012550,0.087590,0.240112,0.000000,0.004502,...,0.0,0.020814,0.000000,0.000518,0.000000,0.000000,0.727057,0.000000,0.082672,1


In [111]:
features_df.columns = features_df.columns.astype(str)
log_model = LogisticRegression(max_iter=5000)
log_model.fit(features_df, outputs)

  y = column_or_1d(y, warn=True)


In [112]:
kmeans.fit_predict(test_features)
test_labels = kmeans.labels_
test_labels

array([1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0,
       1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
       1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1,
       1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0,
       1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0,
       1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0,
       1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0,
       1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0,
       0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0,
       1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1,

In [113]:
len(test_labels)

772

In [114]:
len(test_features)

772

In [115]:
type(test_labels)

numpy.ndarray

In [116]:
type(test_features)

numpy.ndarray

In [117]:
test_labels[0]

1

In [118]:
test_features_df = pd.DataFrame(test_features)
kmeans = KMeans(n_clusters=2, random_state=0) # Example with 2 clusters

# Fit the KMeans model to your features and get cluster labels
cluster_labels = kmeans.fit_predict(test_features)

# Now, add the cluster labels as a new feature to your feature dataframe
test_features_df['cluster_label'] = cluster_labels

test_features_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2039,2040,2041,2042,2043,2044,2045,2046,2047,cluster_label
0,0.081655,0.006859,0.002232,0.000000,0.027540,0.149416,0.037099,0.207096,0.000000,0.007198,...,0.000000,0.049882,0.007751,0.001284,0.00000,0.000000,0.524439,0.000000,0.119593,1
1,0.022382,0.021474,0.000382,0.000000,0.006245,0.335390,0.029722,0.224612,0.000000,0.000814,...,0.000000,0.002509,0.013145,0.006740,0.00000,0.000000,0.560707,0.001430,0.169452,0
2,0.019206,0.012044,0.006367,0.000000,0.024909,0.070121,0.066268,0.392701,0.000000,0.012054,...,0.000000,0.006177,0.000000,0.042060,0.00000,0.000000,0.532508,0.000000,0.342501,1
3,0.121320,0.031846,0.015275,0.000000,0.002273,0.068873,0.021166,0.760909,0.000000,0.000000,...,0.000000,0.014176,0.000000,0.014404,0.00000,0.000000,0.371635,0.000000,0.419501,0
4,0.042180,0.006506,0.032357,0.000000,0.020295,0.001907,0.117722,0.425115,0.000000,0.047931,...,0.000000,0.042746,0.000000,0.005007,0.00000,0.000000,0.483473,0.000000,0.072512,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
767,0.016037,0.005745,0.000000,0.000000,0.005236,0.010989,0.028134,0.425591,0.000000,0.002543,...,0.000000,0.000087,0.000000,0.041129,0.00000,0.000000,0.168449,0.000000,0.105704,0
768,0.007176,0.033355,0.065767,0.000000,0.014481,0.046995,0.092484,0.263897,0.022950,0.014531,...,0.001173,0.004560,0.000532,0.000000,0.00000,0.043839,0.532507,0.014703,0.090328,1
769,0.003906,0.002502,0.017370,0.000000,0.003047,0.052243,0.091307,0.286514,0.000000,0.006906,...,0.000000,0.038382,0.000000,0.018863,0.00000,0.000000,0.332972,0.000000,0.106839,0
770,0.054864,0.033251,0.016473,0.000344,0.002823,0.000000,0.149029,0.502561,0.017719,0.007216,...,0.000000,0.019318,0.015477,0.111106,0.00516,0.020614,0.558140,0.058538,0.174689,1


In [119]:
test_features_df.columns = test_features_df.columns.astype(str)
y_pred = log_model.predict(test_features_df)
test_outputs = np.load('/content/plain_labels_test.npy')
accuracy = accuracy_score(test_outputs, y_pred)
print(f'Accuracy of Logisitic Regression: {accuracy}')

print('Classification Report: ')
print(classification_report(test_outputs, y_pred))

cm = confusion_matrix(test_outputs, y_pred)
print('Confuson Matrix: ')
print(cm)

Accuracy of Logisitic Regression: 0.6955958549222798
Classification Report: 
              precision    recall  f1-score   support

           0       0.76      0.70      0.72       444
           1       0.63      0.70      0.66       328

    accuracy                           0.70       772
   macro avg       0.69      0.70      0.69       772
weighted avg       0.70      0.70      0.70       772

Confuson Matrix: 
[[309 135]
 [100 228]]


#Clustering

In [120]:
features = np.load('/content/plain_features.npy', allow_pickle=True)
outputs = np.load('/content/plain_labels.npy', allow_pickle=True)
features_df = pd.DataFrame(features)
kmeans = KMeans(n_clusters=2, random_state=0) # Example with 2 clusters

# Fit the KMeans model to your features and get cluster labels
cluster_labels = kmeans.fit_predict(features)

# Now, add the cluster labels as a new feature to your feature dataframe
features_df['cluster_label'] = cluster_labels

features_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2039,2040,2041,2042,2043,2044,2045,2046,2047,cluster_label
0,0.055867,0.004403,0.000000,0.0,0.000415,0.171867,0.121604,0.276430,0.004079,0.000000,...,0.0,0.010255,0.000000,0.000000,0.000000,0.000000,0.637357,0.000000,0.091797,1
1,0.006359,0.000310,0.001382,0.0,0.007300,0.032589,0.099136,0.143320,0.014443,0.008430,...,0.0,0.021771,0.000000,0.011099,0.000000,0.000000,0.690614,0.000000,0.170153,0
2,0.028010,0.015728,0.000448,0.0,0.008772,0.021521,0.060416,0.387198,0.000000,0.000769,...,0.0,0.002817,0.002135,0.011431,0.000000,0.001391,0.511592,0.009306,0.123999,1
3,0.000000,0.005009,0.001263,0.0,0.000622,0.019464,0.026923,0.558397,0.000000,0.000000,...,0.0,0.000000,0.000000,0.023221,0.000000,0.000000,0.217898,0.000000,0.077009,0
4,0.115535,0.003434,0.007155,0.0,0.005720,0.068678,0.090083,0.201725,0.000000,0.062678,...,0.0,0.015577,0.000000,0.000000,0.000000,0.000596,1.211246,0.000000,0.056072,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1123,0.040205,0.007915,0.007356,0.0,0.007344,0.001901,0.030719,0.578499,0.000000,0.000000,...,0.0,0.000000,0.000000,0.033619,0.000000,0.000000,0.135101,0.012461,0.299129,0
1124,0.035803,0.012886,0.001337,0.0,0.042935,0.055422,0.068202,0.153038,0.000000,0.095449,...,0.0,0.020240,0.004312,0.002943,0.000000,0.000000,0.511693,0.000000,0.099855,1
1125,0.027943,0.020778,0.006132,0.0,0.001427,0.050048,0.020232,0.227269,0.000000,0.001566,...,0.0,0.015078,0.006867,0.011445,0.000000,0.000000,0.518708,0.000000,0.107807,0
1126,0.014029,0.007981,0.007266,0.0,0.000634,0.012550,0.087590,0.240112,0.000000,0.004502,...,0.0,0.020814,0.000000,0.000518,0.000000,0.000000,0.727057,0.000000,0.082672,1


In [121]:
features_df.columns = features_df.columns.astype(str)
log_model = LogisticRegression(max_iter=5000)
log_model.fit(features_df, outputs)

  y = column_or_1d(y, warn=True)


In [122]:
test_features = np.load('/content/plain_features_test.npy', allow_pickle=True)
test_outputs = np.load('/content/plain_labels_test.npy', allow_pickle=True)

In [123]:
test_features_df = pd.DataFrame(test_features)

cluster_labels = kmeans.fit_predict(test_features)

# Now, add the cluster labels as a new feature to your feature dataframe
test_features_df['cluster_label'] = cluster_labels

test_features_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2039,2040,2041,2042,2043,2044,2045,2046,2047,cluster_label
0,0.081655,0.006859,0.002232,0.000000,0.027540,0.149416,0.037099,0.207096,0.000000,0.007198,...,0.000000,0.049882,0.007751,0.001284,0.00000,0.000000,0.524439,0.000000,0.119593,1
1,0.022382,0.021474,0.000382,0.000000,0.006245,0.335390,0.029722,0.224612,0.000000,0.000814,...,0.000000,0.002509,0.013145,0.006740,0.00000,0.000000,0.560707,0.001430,0.169452,0
2,0.019206,0.012044,0.006367,0.000000,0.024909,0.070121,0.066268,0.392701,0.000000,0.012054,...,0.000000,0.006177,0.000000,0.042060,0.00000,0.000000,0.532508,0.000000,0.342501,1
3,0.121320,0.031846,0.015275,0.000000,0.002273,0.068873,0.021166,0.760909,0.000000,0.000000,...,0.000000,0.014176,0.000000,0.014404,0.00000,0.000000,0.371635,0.000000,0.419501,0
4,0.042180,0.006506,0.032357,0.000000,0.020295,0.001907,0.117722,0.425115,0.000000,0.047931,...,0.000000,0.042746,0.000000,0.005007,0.00000,0.000000,0.483473,0.000000,0.072512,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
767,0.016037,0.005745,0.000000,0.000000,0.005236,0.010989,0.028134,0.425591,0.000000,0.002543,...,0.000000,0.000087,0.000000,0.041129,0.00000,0.000000,0.168449,0.000000,0.105704,0
768,0.007176,0.033355,0.065767,0.000000,0.014481,0.046995,0.092484,0.263897,0.022950,0.014531,...,0.001173,0.004560,0.000532,0.000000,0.00000,0.043839,0.532507,0.014703,0.090328,1
769,0.003906,0.002502,0.017370,0.000000,0.003047,0.052243,0.091307,0.286514,0.000000,0.006906,...,0.000000,0.038382,0.000000,0.018863,0.00000,0.000000,0.332972,0.000000,0.106839,0
770,0.054864,0.033251,0.016473,0.000344,0.002823,0.000000,0.149029,0.502561,0.017719,0.007216,...,0.000000,0.019318,0.015477,0.111106,0.00516,0.020614,0.558140,0.058538,0.174689,1


In [124]:
# making predictions
test_features_df.columns = test_features_df.columns.astype(str)
y_pred = log_model.predict(test_features_df)
test_outputs = np.load('/content/plain_labels_test.npy')
accuracy = accuracy_score(test_outputs, y_pred)
print(f'Accuracy of Logisitic Regression: {accuracy}')

print('Classification Report: ')
print(classification_report(test_outputs, y_pred))

cm = confusion_matrix(test_outputs, y_pred)
print('Confuson Matrix: ')
print(cm)

Accuracy of Logisitic Regression: 0.6955958549222798
Classification Report: 
              precision    recall  f1-score   support

           0       0.76      0.70      0.72       444
           1       0.63      0.70      0.66       328

    accuracy                           0.70       772
   macro avg       0.69      0.70      0.69       772
weighted avg       0.70      0.70      0.70       772

Confuson Matrix: 
[[309 135]
 [100 228]]


In [None]:
# prompt: apply hyper parameter tuning on the log_
# model

from sklearn.model_selection import GridSearchCV
# Define the parameter grid for hyperparameter tuning
param_grid = {
    'C': [0.1, 1, 10],  # Regularization parameter
    'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'] # Optimization algorithm
}

# Create a Logistic Regression model
log_model = LogisticRegression(max_iter=5000)


# Create GridSearchCV object
grid_search = GridSearchCV(log_model, param_grid, cv=5, scoring='accuracy') # 5-fold cross-validation


# Fit the grid search to the data
grid_search.fit(features_df, outputs)

# Get the best parameters and best score
print("Best parameters:", grid_search.best_params_)
print("Best score:", grid_search.best_score_)

# Train the model with the best hyperparameters
best_log_model = grid_search.best_estimator_


# Make predictions on the test set using the best model
y_pred = best_log_model.predict(test_features_df)

# Evaluate the model
accuracy = accuracy_score(test_outputs, y_pred)
print(f'Accuracy of Logistic Regression (Tuned): {accuracy}')

print('Classification Report: ')
print(classification_report(test_outputs, y_pred))

cm = confusion_matrix(test_outputs, y_pred)
print('Confusion Matrix: ')
cm

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = colu