In [4]:
!git clone 

fatal: destination path 'CFIT' already exists and is not an empty directory.


In [5]:
import os
os.chdir('CFIT')

In [10]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GroupKFold, StratifiedGroupKFold, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.utils.class_weight import compute_class_weight
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.utils import to_categorical


In [None]:
import pandas as pd
import re

# Function to detect delimiter by checking the most frequent separator
def detect_delimiter(filepath, num_rows=10):
    with open(filepath, 'r') as file:
        # Read the first few lines of the file
        lines = [file.readline() for _ in range(num_rows)]

    # Create a dictionary to store the frequency of each separator
    separator_freq = {}
    for line in lines:
        for sep in ['\t', ',', ';', ':', '|', ' ']:  # Common separators
            count = line.count(sep)
            if count > 0:
                separator_freq[sep] = separator_freq.get(sep, 0) + count

    # Find the separator with the highest frequency
    most_frequent_sep = max(separator_freq, key=separator_freq.get, default=None)

    # If no common separator is found, check for repeated characters
    if most_frequent_sep is None:
        # Find repeated characters (potential delimiter)
        for line in lines:
            match = re.search(r"([^\w\s])\1+", line)
            if match:
                most_frequent_sep = match.group(1)
                break

    return most_frequent_sep

# Detect the delimiter
delimiter = detect_delimiter('Feature_Score_dat0DayWindow.txt')


# Load dataset using the detected delimiter
df = pd.read_table('Feature_Score_dat0DayWindow.txt', delimiter=delimiter)


In [25]:
#Impute missing features
from sklearn.impute import KNNImputer
imputer = KNNImputer(n_neighbors=5)
X = df.iloc[:, 5:].values  # Assuming features start from column 6
X = imputer.fit_transform(X)

In [26]:
# Features and target

y = df["Score"].values
groups = df["CowId"].values

# Standardize features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [27]:


# Train-test split while respecting groups (CowId)
def group_split(X, y, groups, test_size=0.2, random_state=42):
    unique_groups = np.unique(groups)
    test_groups = np.random.choice(unique_groups, size=int(len(unique_groups) * test_size), replace=False)
    train_mask = ~np.isin(groups, test_groups)
    test_mask = np.isin(groups, test_groups)
    return X[train_mask], X[test_mask], y[train_mask], y[test_mask]

X_train, X_test, y_train, y_test = group_split(X_scaled, y, groups)

# Handle class imbalance with class weights
class_weights = compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)
class_weights_dict = {i: class_weights[i] for i in range(len(class_weights))}

# Models and hyperparameters
models = {
    "LogisticRegression": LogisticRegression(max_iter=1000),
    "SVC": SVC(probability=True),
    "RandomForest": RandomForestClassifier(),
    "KNeighbors": KNeighborsClassifier(),
    "DecisionTree": DecisionTreeClassifier()
}

params = {
    "LogisticRegression": {"C": [0.1, 1, 10]},
    "SVC": {"C": [0.1, 1, 10], "kernel": ["linear", "rbf"]},
    "RandomForest": {"n_estimators": [50, 100, 200], "max_depth": [None, 10, 20]},
    "KNeighbors": {"n_neighbors": [3, 5, 7]},
    "DecisionTree": {"max_depth": [None, 10, 20]}
}

# Cross-validation with group-aware splits
cv = StratifiedGroupKFold(n_splits=5)

best_models = {}
for model_name, model in models.items():
    grid = GridSearchCV(
        model, params[model_name], scoring="accuracy", cv=cv, n_jobs=-1
    )
    grid.fit(X_train, y_train, groups=groups[:len(y_train)])
    best_models[model_name] = grid.best_estimator_
    print(f"Best params for {model_name}: {grid.best_params_}")
    print(f"CV Accuracy for {model_name}: {grid.best_score_}")





Best params for LogisticRegression: {'C': 0.1}
CV Accuracy for LogisticRegression: 0.5305597657710592




Best params for SVC: {'C': 0.1, 'kernel': 'rbf'}
CV Accuracy for SVC: 0.6528567517760804




Best params for RandomForest: {'max_depth': None, 'n_estimators': 100}
CV Accuracy for RandomForest: 0.6549132658511831




Best params for KNeighbors: {'n_neighbors': 7}
CV Accuracy for KNeighbors: 0.6270086622865587




Best params for DecisionTree: {'max_depth': 10}
CV Accuracy for DecisionTree: 0.5357624793013194


In [30]:
# Deep Learning model
# Convert labels to zero-based indexing
y_train = y_train - 1
y_test = y_test - 1
def build_nn(input_dim):
    model = Sequential([
        Dense(128, activation='relu', input_dim=input_dim),
        Dropout(0.3),
        Dense(64, activation='relu'),
        Dense(len(np.unique(y_train)), activation='softmax')
    ])
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    return model

nn_model = build_nn(X_train.shape[1])
nn_model.fit(X_train, y_train, epochs=20, batch_size=32, class_weight=class_weights_dict, validation_split=0.2)



Epoch 1/20
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 12ms/step - accuracy: 0.3154 - loss: 2.1570 - val_accuracy: 0.1969 - val_loss: 1.7444
Epoch 2/20
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.3538 - loss: 1.4179 - val_accuracy: 0.3161 - val_loss: 1.9524
Epoch 3/20
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.3093 - loss: 1.3726 - val_accuracy: 0.2591 - val_loss: 1.5622
Epoch 4/20
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.4157 - loss: 1.0323 - val_accuracy: 0.3627 - val_loss: 1.4847
Epoch 5/20
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.4244 - loss: 1.0706 - val_accuracy: 0.3627 - val_loss: 1.3987
Epoch 6/20
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.4664 - loss: 0.9107 - val_accuracy: 0.3057 - val_loss: 1.5209
Epoch 7/20
[1m25/25[0m [32m━━━━━━━━━

<keras.src.callbacks.history.History at 0x7abd800af160>

In [31]:
# Evaluation
print("\nEvaluation on Test Data:")
for model_name, model in best_models.items():
    y_pred = model.predict(X_test)
    print(f"\n{model_name} Report:")
    print(classification_report(y_test, y_pred))





Evaluation on Test Data:

LogisticRegression Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       155
           1       0.24      0.73      0.36        64
           2       0.05      0.12      0.07        17
           3       0.00      0.00      0.00         6
           4       0.00      0.00      0.00         3

    accuracy                           0.20       245
   macro avg       0.06      0.17      0.09       245
weighted avg       0.07      0.20      0.10       245



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



SVC Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       155
           1       0.26      1.00      0.41        64
           2       0.00      0.00      0.00        17
           3       0.00      0.00      0.00         6
           4       0.00      0.00      0.00         3

    accuracy                           0.26       245
   macro avg       0.05      0.20      0.08       245
weighted avg       0.07      0.26      0.11       245


RandomForest Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       155
           1       0.24      0.88      0.38        64
           2       0.00      0.00      0.00        17
           3       0.00      0.00      0.00         6
           4       0.00      0.00      0.00         3

    accuracy                           0.23       245
   macro avg       0.05      0.17      0.08       245
weighted avg       0.06      0.23      0.1

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
# Deep learning evaluation
y_test_pred = np.argmax(nn_model.predict(X_test), axis=1)
print("\nDeep Learning Report:")
print(classification_report(y_test, y_test_pred))

In [35]:
# Step 1: Import Required Libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.utils.class_weight import compute_class_weight
from imblearn.over_sampling import SMOTE
from sklearn.metrics import classification_report, confusion_matrix
from xgboost import XGBClassifier
from sklearn.impute import KNNImputer
import matplotlib.pyplot as plt

# Step 2: Load and Split Dataset
# Replace this with your actual dataset loading logic
# Impute missing features
imputer = KNNImputer(n_neighbors=5)
X = df.iloc[:, 5:].values  # Assuming features start from column 6
X = imputer.fit_transform(X)
y = df["Score"].values
groups = df["CowId"].values

# Step 2.1: Check for NaN values after imputation
print("NaN values after imputation in X:", np.sum(np.isnan(X)))
print("NaN values in y:", np.sum(np.isnan(y)))

# Step 2.2: Re-map labels to start from 0 (important for XGBoost)
y = y - 1  # Assuming original labels are from 1 to 5, and remapping to 0-4

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# Step 3: Scale the Features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Step 4: Address Class Imbalance with SMOTE
smote = SMOTE(random_state=42, k_neighbors=2)  # Use smaller k_neighbors
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_scaled, y_train)

# Step 4.1: Check class distribution before and after SMOTE
print(f"Original class distribution: {np.bincount(y_train)}")
print(f"Resampled class distribution: {np.bincount(y_train_resampled)}")

# Step 5: Compute Class Weights
class_weights = compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)
class_weights_dict = dict(enumerate(class_weights))

# Step 6: Initialize Model (XGBoost)
model = XGBClassifier(scale_pos_weight=class_weights_dict.get(1, 1), random_state=42, use_label_encoder=False, eval_metric='mlogloss')

# Step 7: Hyperparameter Tuning with Grid Search
param_grid = {
    'n_estimators': [50, 100, 150],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2]
}

# Debug: Ensure no NaN values in features and labels before fitting model
print(f"Before fitting: X_train_resampled contains NaN: {np.any(np.isnan(X_train_resampled))}, y_train_resampled contains NaN: {np.any(np.isnan(y_train_resampled))}")

grid_search = GridSearchCV(estimator=model, param_grid=param_grid, scoring='f1_weighted', cv=5, verbose=2, error_score='raise')
grid_search.fit(X_train_resampled, y_train_resampled)

# Best model after Grid Search
best_model = grid_search.best_estimator_

# Step 8: Evaluate Model on Test Data
y_pred = best_model.predict(X_test_scaled)
print("Classification Report:")
print(classification_report(y_test, y_pred))

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

# Step 9: Cross-Validation for Robust Metrics
cv_scores = cross_val_score(best_model, X_train_resampled, y_train_resampled, cv=5, scoring='f1_weighted')
print(f"Cross-Validation F1-Score: {cv_scores.mean():.4f}")

# Step 10: Plot Class Distribution (Optional for Debugging)
# Check how class distribution looks after resampling
plt.figure(figsize=(8, 5))
plt.bar(np.unique(y_train_resampled), np.bincount(y_train_resampled))
plt.xlabel('Class')
plt.ylabel('Frequency')
plt.title('Class Distribution after SMOTE')
plt.show()


NaN values after imputation in X: 0
NaN values in y: 0
Original class distribution: [628 238  67  31   4]
Resampled class distribution: [628 628 628 628 628]
Before fitting: X_train_resampled contains NaN: False, y_train_resampled contains NaN: False
Fitting 5 folds for each of 27 candidates, totalling 135 fits


Parameters: { "scale_pos_weight", "use_label_encoder" } are not used.



[CV] END ...learning_rate=0.01, max_depth=3, n_estimators=50; total time=  20.3s


Parameters: { "scale_pos_weight", "use_label_encoder" } are not used.



[CV] END ...learning_rate=0.01, max_depth=3, n_estimators=50; total time=  17.2s


Parameters: { "scale_pos_weight", "use_label_encoder" } are not used.



[CV] END ...learning_rate=0.01, max_depth=3, n_estimators=50; total time=  18.3s


Parameters: { "scale_pos_weight", "use_label_encoder" } are not used.



[CV] END ...learning_rate=0.01, max_depth=3, n_estimators=50; total time=  20.6s


Parameters: { "scale_pos_weight", "use_label_encoder" } are not used.



[CV] END ...learning_rate=0.01, max_depth=3, n_estimators=50; total time=  17.8s


Parameters: { "scale_pos_weight", "use_label_encoder" } are not used.



[CV] END ..learning_rate=0.01, max_depth=3, n_estimators=100; total time=  38.5s


Parameters: { "scale_pos_weight", "use_label_encoder" } are not used.



[CV] END ..learning_rate=0.01, max_depth=3, n_estimators=100; total time=  37.7s


Parameters: { "scale_pos_weight", "use_label_encoder" } are not used.



[CV] END ..learning_rate=0.01, max_depth=3, n_estimators=100; total time=  40.1s


Parameters: { "scale_pos_weight", "use_label_encoder" } are not used.



[CV] END ..learning_rate=0.01, max_depth=3, n_estimators=100; total time=  35.8s


Parameters: { "scale_pos_weight", "use_label_encoder" } are not used.



[CV] END ..learning_rate=0.01, max_depth=3, n_estimators=100; total time=  37.5s


Parameters: { "scale_pos_weight", "use_label_encoder" } are not used.



[CV] END ..learning_rate=0.01, max_depth=3, n_estimators=150; total time=  54.3s


Parameters: { "scale_pos_weight", "use_label_encoder" } are not used.



[CV] END ..learning_rate=0.01, max_depth=3, n_estimators=150; total time=  54.7s


Parameters: { "scale_pos_weight", "use_label_encoder" } are not used.



[CV] END ..learning_rate=0.01, max_depth=3, n_estimators=150; total time= 1.2min


Parameters: { "scale_pos_weight", "use_label_encoder" } are not used.



[CV] END ..learning_rate=0.01, max_depth=3, n_estimators=150; total time= 1.0min


Parameters: { "scale_pos_weight", "use_label_encoder" } are not used.



[CV] END ..learning_rate=0.01, max_depth=3, n_estimators=150; total time=  53.9s


Parameters: { "scale_pos_weight", "use_label_encoder" } are not used.



[CV] END ...learning_rate=0.01, max_depth=5, n_estimators=50; total time=  49.5s


Parameters: { "scale_pos_weight", "use_label_encoder" } are not used.



[CV] END ...learning_rate=0.01, max_depth=5, n_estimators=50; total time=  46.2s


Parameters: { "scale_pos_weight", "use_label_encoder" } are not used.



[CV] END ...learning_rate=0.01, max_depth=5, n_estimators=50; total time=  50.3s


Parameters: { "scale_pos_weight", "use_label_encoder" } are not used.



[CV] END ...learning_rate=0.01, max_depth=5, n_estimators=50; total time=  47.8s


Parameters: { "scale_pos_weight", "use_label_encoder" } are not used.



[CV] END ...learning_rate=0.01, max_depth=5, n_estimators=50; total time=  48.0s


Parameters: { "scale_pos_weight", "use_label_encoder" } are not used.



[CV] END ..learning_rate=0.01, max_depth=5, n_estimators=100; total time= 1.5min


Parameters: { "scale_pos_weight", "use_label_encoder" } are not used.



[CV] END ..learning_rate=0.01, max_depth=5, n_estimators=100; total time= 1.6min


Parameters: { "scale_pos_weight", "use_label_encoder" } are not used.



[CV] END ..learning_rate=0.01, max_depth=5, n_estimators=100; total time= 1.6min


Parameters: { "scale_pos_weight", "use_label_encoder" } are not used.



[CV] END ..learning_rate=0.01, max_depth=5, n_estimators=100; total time= 1.6min


Parameters: { "scale_pos_weight", "use_label_encoder" } are not used.



[CV] END ..learning_rate=0.01, max_depth=5, n_estimators=100; total time= 1.6min


Parameters: { "scale_pos_weight", "use_label_encoder" } are not used.



[CV] END ..learning_rate=0.01, max_depth=5, n_estimators=150; total time= 2.4min


Parameters: { "scale_pos_weight", "use_label_encoder" } are not used.



[CV] END ..learning_rate=0.01, max_depth=5, n_estimators=150; total time= 2.3min


Parameters: { "scale_pos_weight", "use_label_encoder" } are not used.



[CV] END ..learning_rate=0.01, max_depth=5, n_estimators=150; total time= 2.3min


Parameters: { "scale_pos_weight", "use_label_encoder" } are not used.



[CV] END ..learning_rate=0.01, max_depth=5, n_estimators=150; total time= 2.4min


Parameters: { "scale_pos_weight", "use_label_encoder" } are not used.



[CV] END ..learning_rate=0.01, max_depth=5, n_estimators=150; total time= 2.3min


Parameters: { "scale_pos_weight", "use_label_encoder" } are not used.



[CV] END ...learning_rate=0.01, max_depth=7, n_estimators=50; total time= 1.5min


Parameters: { "scale_pos_weight", "use_label_encoder" } are not used.



[CV] END ...learning_rate=0.01, max_depth=7, n_estimators=50; total time= 1.5min


Parameters: { "scale_pos_weight", "use_label_encoder" } are not used.



[CV] END ...learning_rate=0.01, max_depth=7, n_estimators=50; total time= 1.5min


Parameters: { "scale_pos_weight", "use_label_encoder" } are not used.



[CV] END ...learning_rate=0.01, max_depth=7, n_estimators=50; total time= 1.5min


Parameters: { "scale_pos_weight", "use_label_encoder" } are not used.



[CV] END ...learning_rate=0.01, max_depth=7, n_estimators=50; total time= 1.4min


Parameters: { "scale_pos_weight", "use_label_encoder" } are not used.



[CV] END ..learning_rate=0.01, max_depth=7, n_estimators=100; total time= 2.9min


Parameters: { "scale_pos_weight", "use_label_encoder" } are not used.



[CV] END ..learning_rate=0.01, max_depth=7, n_estimators=100; total time= 2.9min


Parameters: { "scale_pos_weight", "use_label_encoder" } are not used.



[CV] END ..learning_rate=0.01, max_depth=7, n_estimators=100; total time= 3.0min


Parameters: { "scale_pos_weight", "use_label_encoder" } are not used.



[CV] END ..learning_rate=0.01, max_depth=7, n_estimators=100; total time= 3.1min


Parameters: { "scale_pos_weight", "use_label_encoder" } are not used.



[CV] END ..learning_rate=0.01, max_depth=7, n_estimators=100; total time= 3.0min


Parameters: { "scale_pos_weight", "use_label_encoder" } are not used.



[CV] END ..learning_rate=0.01, max_depth=7, n_estimators=150; total time= 4.3min


Parameters: { "scale_pos_weight", "use_label_encoder" } are not used.



[CV] END ..learning_rate=0.01, max_depth=7, n_estimators=150; total time= 4.4min


Parameters: { "scale_pos_weight", "use_label_encoder" } are not used.



KeyboardInterrupt: 

In [34]:
print("Original y_train labels:", sorted(set(y_train)))
print("Resampled y_train_resampled labels:", sorted(set(y_train_resampled)))

Original y_train labels: [1, 2, 3, 4, 5]
Resampled y_train_resampled labels: [1, 2, 3, 4, 5]


cp: missing destination file operand after '/content/CFIT.ipynb/content/CFIT/'
Try 'cp --help' for more information.
