In [None]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
from sklearn.feature_selection import SelectFromModel
from sklearn.inspection import permutation_importance
from sklearn.model_selection import cross_val_predict




# 4-wayAdditive DATA


In [None]:
# Read the tab-separated 4-way additive dataset from the working directory
# and preview its first rows as the cell output.
way_additive_df = pd.read_csv('4-wayAdditive100feat.txt', sep='\t')
way_additive_df.head()

In [None]:
# Dimensions of the 4-way additive frame as a (rows, columns) tuple.
way_additive_df.shape

In [None]:
# Last rows of the frame, to check the file was read through to the end.
way_additive_df.tail()

In [None]:
# Print a concise summary of the frame (columns, non-null counts, dtypes).
way_additive_df.info()

In [None]:
# Descriptive summary statistics for the columns of the frame.
way_additive_df.describe()

In [None]:
# Number of missing values in each column.
way_additive_df.isnull().sum()

In [None]:
# Data type of each column.
way_additive_df.dtypes

In [None]:

# Data distribution of one feature (N0) in the 4-way additive dataset.
# The figure carries a title and axis labels so it can be read on its own.
fig, ax = plt.subplots()
ax.hist(way_additive_df["N0"])
ax.set(title="Distribution of N0 (4-way additive)", xlabel="N0", ylabel="Count")
plt.show()



In [None]:
# Class balance: number of rows for each value of the 'Class' column.
class_counts = way_additive_df["Class"].value_counts()
print(class_counts)


In [None]:
# Feature relationships: M0P1 against M1P2, both taken from the 4-way
# additive dataset. Axis labels name the columns being compared.
fig, ax = plt.subplots()
ax.scatter(way_additive_df["M0P1"], way_additive_df["M1P2"])
ax.set(title="M0P1 vs M1P2 (4-way additive)", xlabel="M0P1", ylabel="M1P2")
plt.show()


In [None]:
# Outlier detection: box plot of the N0 feature in the 4-way additive dataset.
fig, ax = plt.subplots()
ax.boxplot(way_additive_df["N0"])
ax.set_xticklabels(["N0"])  # replace the default positional tick "1" with the column name
ax.set(title="Box plot of N0 (4-way additive)", ylabel="Value")
plt.show()

In [None]:
# Horizontal box plots for every column of the 4-way additive dataset.
# The style is applied before the figure is created so it affects the whole
# figure, and the title describes the plot instead of the former placeholder
# text "Box Plots (or Violin Plots)".
sns.set(style="whitegrid")
fig, ax = plt.subplots(figsize=(16, 20))
sns.boxplot(data=way_additive_df, orient="h", palette="Set2", ax=ax)
ax.set_title("Box plots of all columns - 4-way additive dataset")
plt.show()

In [None]:
# Distribution of the N52 feature in the 4-way additive dataset, with a
# title and axis labels so the figure stands alone.
fig, ax = plt.subplots()
ax.hist(way_additive_df["N52"])
ax.set(title="Distribution of N52 (4-way additive)", xlabel="N52", ylabel="Count")
plt.show()


In [None]:
# Distribution of the M2P3 feature in the 4-way additive dataset, with a
# title and axis labels so the figure stands alone.
fig, ax = plt.subplots()
ax.hist(way_additive_df["M2P3"])
ax.set(title="Distribution of M2P3 (4-way additive)", xlabel="M2P3", ylabel="Count")
plt.show()


In [None]:
# The label is the 'Class' column; the predictors are every other column.
y = way_additive_df['Class']
X = way_additive_df.drop(columns='Class')

# Hold out 20% of the rows for testing; the fixed seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=10,
)

# Report the shape of each partition.
for label, features, target in (
    ("Training set - X:", X_train, y_train),
    ("Testing set - X:", X_test, y_test),
):
    print(label, features.shape, " y:", target.shape)

In [None]:
# Candidate classifiers to compare; every one uses the same seed.
models = [
    ("Decision Tree", DecisionTreeClassifier(random_state=10)),
    ("Random Forest", RandomForestClassifier(random_state=10)),
    ("Gradient Boosting", GradientBoostingClassifier(random_state=10)),
    ("Logistic Regression", LogisticRegression(random_state=10)),
    ("Support Vector Machine", SVC(random_state=10)),
]

# Fit each model on the training split, then plot and list the features it
# weights most. Models exposing neither `feature_importances_` nor `coef_`
# are reported and skipped.
for model_name, model in models:
    model.fit(X_train, y_train)

    if hasattr(model, 'feature_importances_'):
        # Estimators that expose per-feature importances directly.
        feature_importance = model.feature_importances_
    elif hasattr(model, 'coef_'):
        # Linear estimators: magnitude of the first coefficient row.
        # NOTE(review): only row 0 is read, which assumes a binary target - confirm.
        feature_importance = np.abs(model.coef_[0])
    else:
        print(f"\n{model_name} does not provide feature importance scores.")
        continue

    # One row per feature, highest importance first.
    feature_importance_df = pd.DataFrame({
        'Feature': X_train.columns,
        'Importance': feature_importance,
    }).sort_values(by='Importance', ascending=False)

    fig, ax = plt.subplots(figsize=(16, 20))
    ax.barh(feature_importance_df['Feature'], feature_importance_df['Importance'])
    # barh draws the first row at the bottom; flip the axis so the most
    # important feature is at the top of the chart.
    ax.invert_yaxis()
    ax.set_xlabel('Importance Score')
    ax.set_ylabel('Feature')
    ax.set_title(f'{model_name} - Feature Importance')
    plt.show()

    # Keep features whose importance is at least the mean importance.
    # prefit=True reuses the model fitted above instead of training it again.
    top_features = SelectFromModel(model, threshold='mean', prefit=True).get_support()
    print(f"\nTop Features for {model_name}:\n", X_train.columns[top_features])

In [None]:
# Evaluate every candidate: 5-fold cross-validated accuracy on the training
# split, then accuracy and a classification report on the held-out test split.
for model_name, model in models:
    # Fit on the full training split; this fitted instance scores the test set.
    model.fit(X_train, y_train)

    fold_accuracy = cross_val_score(model, X_train, y_train, cv=5, scoring='accuracy')
    test_predictions = model.predict(X_test)
    test_accuracy = accuracy_score(y_test, test_predictions)

    print(f"\nModel: {model_name}")
    print(f"Cross-Validation Accuracy: {fold_accuracy.mean():.4f} (+/- {fold_accuracy.std():.4f})")
    print(f"Test Accuracy: {test_accuracy:.4f}")

    # Per-class precision, recall and F1 on the test split.
    print("\nClassification Report:")
    print(classification_report(y_test, test_predictions))

**Decision Tree**:
Cross-Validation Accuracy: 89.62%
Test Accuracy: 92.50%
Balanced performance with good precision, recall, and F1-score for both classes.

**Random Forest**:
Cross-Validation Accuracy: 95.00%
Test Accuracy: 97.50%
Improved accuracy compared to the Decision Tree. Good precision, recall, and F1-score for both classes.

**Gradient Boosting**:
Cross-Validation Accuracy: 94.25%
Test Accuracy: 97.50%
Similar performance to Random Forest. High accuracy and balanced precision, recall, and F1-score for both classes.

**Logistic Regression**:
Cross-Validation Accuracy: 92.50%
Test Accuracy: 96.50%
Good performance with high precision, recall, and F1-score for both classes.

**Support Vector Machine**:
Cross-Validation Accuracy: 93.88%
Test Accuracy: 98.00%
Highest test accuracy (its cross-validation accuracy is below that of Random Forest and Gradient Boosting), with balanced precision, recall, and F1-score for both classes.

**General Observations**:
All models perform well with high accuracy on the test set.
Random Forest, Gradient Boosting, and Support Vector Machine show particularly high accuracy.
Precision, recall, and F1-scores are balanced for both classes, indicating good model performance.

# 2-wayEpi100feat DATA

In [None]:
# Read the tab-separated 2-way epistasis dataset from the working directory
# and preview its first rows as the cell output.
wayEpi1_df = pd.read_csv('2-wayEpi100feat.txt', sep='\t')
wayEpi1_df.head()

In [None]:
# Dimensions of the 2-way epistasis frame as a (rows, columns) tuple.
wayEpi1_df.shape

In [None]:
# Last rows of the frame, to check the file was read through to the end.
wayEpi1_df.tail()

In [None]:
# Print a concise summary of the frame (columns, non-null counts, dtypes).
wayEpi1_df.info()

In [None]:
# Number of missing values in each column.
wayEpi1_df.isnull().sum()

In [None]:
# Data type of each column.
wayEpi1_df.dtypes

In [None]:

# Data distribution of one feature (N10) in the 2-way epistasis dataset.
# The figure carries a title and axis labels so it can be read on its own.
fig, ax = plt.subplots()
ax.hist(wayEpi1_df["N10"])
ax.set(title="Distribution of N10 (2-way epistasis)", xlabel="N10", ylabel="Count")
plt.show()

In [None]:
# Class balance: number of rows for each value of the 'Class' column.
class_counts = wayEpi1_df["Class"].value_counts()
print(class_counts)


In [None]:
# Feature relationships within the 2-way epistasis dataset.
# Fix: the y-values were previously read from way_additive_df, a different
# dataset, so the scatter paired unrelated rows. Both axes now come from
# wayEpi1_df.
x_col = "M0P1"
y_col = "M1P2"
if y_col not in wayEpi1_df.columns:
    # NOTE(review): the column names of this file are not visible here; if it
    # has no "M1P2" column, fall back to "N10", which this notebook already
    # reads from wayEpi1_df. Replace with the intended second feature.
    y_col = "N10"

fig, ax = plt.subplots()
ax.scatter(wayEpi1_df[x_col], wayEpi1_df[y_col])
ax.set(title=f"{x_col} vs {y_col} (2-way epistasis)", xlabel=x_col, ylabel=y_col)
plt.show()


In [None]:
# Outlier detection for one feature of the 2-way epistasis dataset.
# Fix: this cell previously plotted way_additive_df["N52"], i.e. a column of
# the 4-way additive dataset, inside the 2-way epistasis section.
box_col = "N52"
if box_col not in wayEpi1_df.columns:
    # NOTE(review): fall back to "N10", which this notebook already reads
    # from wayEpi1_df, if this file has no "N52" column - confirm.
    box_col = "N10"

fig, ax = plt.subplots()
ax.boxplot(wayEpi1_df[box_col])
ax.set_xticklabels([box_col])  # replace the default positional tick "1" with the column name
ax.set(title=f"Box plot of {box_col} (2-way epistasis)", ylabel="Value")
plt.show()

In [None]:
# Horizontal box plots for every column of the 2-way epistasis dataset.
# The style is applied before the figure is created so it affects the whole
# figure, and the title describes the plot instead of the former placeholder
# text "Box Plots (or Violin Plots)".
sns.set(style="whitegrid")
fig, ax = plt.subplots(figsize=(16, 20))
sns.boxplot(data=wayEpi1_df, orient="h", palette="Set2", ax=ax)
ax.set_title("Box plots of all columns - 2-way epistasis dataset")
plt.show()

In [None]:
# The label is the 'Class' column; the predictors are every other column.
y = wayEpi1_df['Class']
X = wayEpi1_df.drop(columns='Class')

# Hold out 20% of the rows for testing; the fixed seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=10,
)

In [None]:
# Candidate classifiers to compare; every one uses the same seed.
models = [
    ("Decision Tree", DecisionTreeClassifier(random_state=10)),
    ("Random Forest", RandomForestClassifier(random_state=10)),
    ("Gradient Boosting", GradientBoostingClassifier(random_state=10)),
    ("Logistic Regression", LogisticRegression(random_state=10)),
    ("Support Vector Machine", SVC(random_state=10)),
]

# Fit each model on the training split, then plot and list the features it
# weights most. Models exposing neither `feature_importances_` nor `coef_`
# are reported and skipped.
for model_name, model in models:
    model.fit(X_train, y_train)

    if hasattr(model, 'feature_importances_'):
        # Estimators that expose per-feature importances directly.
        feature_importance = model.feature_importances_
    elif hasattr(model, 'coef_'):
        # Linear estimators: magnitude of the first coefficient row.
        # NOTE(review): only row 0 is read, which assumes a binary target - confirm.
        feature_importance = np.abs(model.coef_[0])
    else:
        print(f"\n{model_name} does not provide feature importance scores.")
        continue

    # One row per feature, highest importance first.
    feature_importance_df = pd.DataFrame({
        'Feature': X_train.columns,
        'Importance': feature_importance,
    }).sort_values(by='Importance', ascending=False)

    fig, ax = plt.subplots(figsize=(16, 20))
    ax.barh(feature_importance_df['Feature'], feature_importance_df['Importance'])
    # barh draws the first row at the bottom; flip the axis so the most
    # important feature is at the top of the chart.
    ax.invert_yaxis()
    ax.set_xlabel('Importance Score')
    ax.set_ylabel('Feature')
    ax.set_title(f'{model_name} - Feature Importance')
    plt.show()

    # Keep features whose importance is at least the mean importance.
    # prefit=True reuses the model fitted above instead of training it again.
    top_features = SelectFromModel(model, threshold='mean', prefit=True).get_support()
    print(f"\nTop Features for {model_name}:\n", X_train.columns[top_features])

In [None]:
# Evaluate every candidate: 5-fold cross-validated accuracy on the training
# split, then accuracy and a classification report on the held-out test split.
for model_name, model in models:
    # Fit on the full training split; this fitted instance scores the test set.
    model.fit(X_train, y_train)

    fold_accuracy = cross_val_score(model, X_train, y_train, cv=5, scoring='accuracy')
    test_predictions = model.predict(X_test)
    test_accuracy = accuracy_score(y_test, test_predictions)

    print(f"\nModel: {model_name}")
    print(f"Cross-Validation Accuracy: {fold_accuracy.mean():.4f} (+/- {fold_accuracy.std():.4f})")
    print(f"Test Accuracy: {test_accuracy:.4f}")

    # Per-class precision, recall and F1 on the test split.
    print("\nClassification Report:")
    print(classification_report(y_test, test_predictions))

**Decision Tree**:
Cross-Validation Accuracy: 51.75% (+/- 1.65)
Test Accuracy: 50.00%
Precision, recall, and F1-score are around 0.50 for both classes.

**Random Forest:**
Cross-Validation Accuracy: 52.63% (+/- 5.57)
Test Accuracy: 55.00%
Precision, recall, and F1-score are around 0.55 for both classes.

**Gradient Boosting:**
Cross-Validation Accuracy: 55.00% (+/- 4.87)
Test Accuracy: 56.50%
Precision, recall, and F1-score are around 0.56 for both classes.

**Logistic Regression:**
Cross-Validation Accuracy: 48.00% (+/- 4.86)
Test Accuracy: 50.50%
Precision, recall, and F1-score are around 0.50 for both classes.

**Support Vector Machine:**
Cross-Validation Accuracy: 49.38% (+/- 4.73)
Test Accuracy: 48.50%
Precision, recall, and F1-score are around 0.48 for both classes.

**Observations:**
None of the models are achieving high accuracy, and their performance is close to random guessing.
The precision, recall, and F1-scores for both classes are not satisfactory.


# 2Additive2-wayEpi100feat DATA

In [None]:
# Read the tab-separated 2-additive / 2-way epistasis dataset from the
# working directory and preview its first rows as the cell output.
additive2_wayEpi100feat_df = pd.read_csv('2Additive2-wayEpi100feat.txt', sep='\t')
additive2_wayEpi100feat_df.head()

In [None]:
# Dimensions of the frame as a (rows, columns) tuple.
additive2_wayEpi100feat_df.shape

In [None]:
# Last rows of the frame, to check the file was read through to the end.
additive2_wayEpi100feat_df.tail()

In [None]:
# Print a concise summary of the frame (columns, non-null counts, dtypes).
additive2_wayEpi100feat_df.info()

In [None]:
# Number of missing values in each column.
additive2_wayEpi100feat_df.isnull().sum()

In [None]:

# Data distribution of one feature (N40) in the 2-additive / 2-way epistasis
# dataset. The figure carries a title and axis labels so it stands alone.
fig, ax = plt.subplots()
ax.hist(additive2_wayEpi100feat_df["N40"])
ax.set(title="Distribution of N40 (2-additive, 2-way epistasis)", xlabel="N40", ylabel="Count")
plt.show()

In [None]:
# Class balance: number of rows for each value of the 'Class' column.
class_counts = additive2_wayEpi100feat_df["Class"].value_counts()
print(class_counts)


In [None]:
# Horizontal box plots for every column of the 2-additive / 2-way epistasis
# dataset. The style is applied before the figure is created so it affects
# the whole figure, and the title describes the plot instead of the former
# placeholder text "Box Plots (or Violin Plots)".
sns.set(style="whitegrid")
fig, ax = plt.subplots(figsize=(16, 20))
sns.boxplot(data=additive2_wayEpi100feat_df, orient="h", palette="Set2", ax=ax)
ax.set_title("Box plots of all columns - 2-additive, 2-way epistasis dataset")
plt.show()

In [None]:
# The label is the 'Class' column; the predictors are every other column.
y = additive2_wayEpi100feat_df['Class']
X = additive2_wayEpi100feat_df.drop(columns='Class')

# Hold out 20% of the rows for testing; the fixed seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=10,
)

In [None]:
# Candidate classifiers to compare; every one uses the same seed.
models = [
    ("Decision Tree", DecisionTreeClassifier(random_state=10)),
    ("Random Forest", RandomForestClassifier(random_state=10)),
    ("Gradient Boosting", GradientBoostingClassifier(random_state=10)),
    ("Logistic Regression", LogisticRegression(random_state=10)),
    ("Support Vector Machine", SVC(random_state=10)),
]

# Fit each model on the training split, then plot and list the features it
# weights most. Models exposing neither `feature_importances_` nor `coef_`
# are reported and skipped.
for model_name, model in models:
    model.fit(X_train, y_train)

    if hasattr(model, 'feature_importances_'):
        # Estimators that expose per-feature importances directly.
        feature_importance = model.feature_importances_
    elif hasattr(model, 'coef_'):
        # Linear estimators: magnitude of the first coefficient row.
        # NOTE(review): only row 0 is read, which assumes a binary target - confirm.
        feature_importance = np.abs(model.coef_[0])
    else:
        print(f"\n{model_name} does not provide feature importance scores.")
        continue

    # One row per feature, highest importance first.
    feature_importance_df = pd.DataFrame({
        'Feature': X_train.columns,
        'Importance': feature_importance,
    }).sort_values(by='Importance', ascending=False)

    fig, ax = plt.subplots(figsize=(16, 20))
    ax.barh(feature_importance_df['Feature'], feature_importance_df['Importance'])
    # barh draws the first row at the bottom; flip the axis so the most
    # important feature is at the top of the chart.
    ax.invert_yaxis()
    ax.set_xlabel('Importance Score')
    ax.set_ylabel('Feature')
    ax.set_title(f'{model_name} - Feature Importance')
    plt.show()

    # Keep features whose importance is at least the mean importance.
    # prefit=True reuses the model fitted above instead of training it again.
    top_features = SelectFromModel(model, threshold='mean', prefit=True).get_support()
    print(f"\nTop Features for {model_name}:\n", X_train.columns[top_features])

In [None]:
# Evaluate every candidate: 5-fold cross-validated accuracy on the training
# split, then accuracy and a classification report on the held-out test split.
for model_name, model in models:
    # Fit on the full training split; this fitted instance scores the test set.
    model.fit(X_train, y_train)

    fold_accuracy = cross_val_score(model, X_train, y_train, cv=5, scoring='accuracy')
    test_predictions = model.predict(X_test)
    test_accuracy = accuracy_score(y_test, test_predictions)

    print(f"\nModel: {model_name}")
    print(f"Cross-Validation Accuracy: {fold_accuracy.mean():.4f} (+/- {fold_accuracy.std():.4f})")
    print(f"Test Accuracy: {test_accuracy:.4f}")

    # Per-class precision, recall and F1 on the test split.
    print("\nClassification Report:")
    print(classification_report(y_test, test_predictions))

**Decision Tree**:
Cross-Validation Accuracy: 51.38% (+/- 4.68)
Test Accuracy: 58.50%
Precision, recall, and F1-score are around 0.58 for both classes.

**Random Forest**:
Cross-Validation Accuracy: 55.12% (+/- 2.57)
Test Accuracy: 51.00%
Precision, recall, and F1-score are around 0.51 for both classes.

**Gradient Boosting**:
Cross-Validation Accuracy: 64.75% (+/- 4.60)
Test Accuracy: 62.50%
Precision, recall, and F1-score are around 0.62 for both classes.

**Logistic Regression**:
Cross-Validation Accuracy: 49.25% (+/- 3.05)
Test Accuracy: 50.00%
Precision, recall, and F1-score are around 0.50 for both classes.

**Support Vector Machine**:
Cross-Validation Accuracy: 51.88% (+/- 2.62)
Test Accuracy: 48.00%
Precision, recall, and F1-score are around 0.48 for both classes.

**Observations**:
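Gradient Boosting performs best on this dataset, with the highest cross-validation (64.75%) and test (62.50%) accuracy and the highest F1-scores.
The Decision Tree's test accuracy (58.50%) is higher than on the 2-wayEpi dataset, particularly in accuracy and precision, but its cross-validation accuracy (51.38%) remains close to chance, so this gain may not be reliable.
Random Forest, Logistic Regression, and Support Vector Machine perform similarly to, or slightly worse than, their results on the 2-wayEpi dataset.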

# 4-wayHeterogeneous100feat DATA

In [None]:
# Read the tab-separated 4-way heterogeneous dataset from the working
# directory and preview its first rows as the cell output.
wayHeterogeneous100feat_df = pd.read_csv('4-wayHeterogeneous100feat.txt', sep='\t')
wayHeterogeneous100feat_df.head()

In [None]:
# Last rows of the frame, to check the file was read through to the end.
wayHeterogeneous100feat_df.tail()

In [None]:
# Print a concise summary of the frame (columns, non-null counts, dtypes).
wayHeterogeneous100feat_df.info()

In [None]:
# Number of missing values in each column.
wayHeterogeneous100feat_df.isnull().sum()

In [None]:

# Data distribution of one feature (N70) in the 4-way heterogeneous dataset.
# The figure carries a title and axis labels so it can be read on its own.
fig, ax = plt.subplots()
ax.hist(wayHeterogeneous100feat_df["N70"])
ax.set(title="Distribution of N70 (4-way heterogeneous)", xlabel="N70", ylabel="Count")
plt.show()

In [None]:
# Horizontal box plots for every column of the 4-way heterogeneous dataset.
# The style is applied before the figure is created so it affects the whole
# figure, and the title describes the plot instead of the former placeholder
# text "Box Plots (or Violin Plots)".
sns.set(style="whitegrid")
fig, ax = plt.subplots(figsize=(16, 20))
sns.boxplot(data=wayHeterogeneous100feat_df, orient="h", palette="Set2", ax=ax)
ax.set_title("Box plots of all columns - 4-way heterogeneous dataset")
plt.show()

In [None]:
# The label is the 'Class' column; the predictors are every other column.
y = wayHeterogeneous100feat_df['Class']
X = wayHeterogeneous100feat_df.drop(columns='Class')

# Hold out 20% of the rows for testing; the fixed seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=10,
)

In [None]:
# Candidate classifiers to compare; every one uses the same seed.
models = [
    ("Decision Tree", DecisionTreeClassifier(random_state=10)),
    ("Random Forest", RandomForestClassifier(random_state=10)),
    ("Gradient Boosting", GradientBoostingClassifier(random_state=10)),
    ("Logistic Regression", LogisticRegression(random_state=10)),
    ("Support Vector Machine", SVC(random_state=10)),
]

# Fit each model on the training split, then plot and list the features it
# weights most. Models exposing neither `feature_importances_` nor `coef_`
# are reported and skipped.
for model_name, model in models:
    model.fit(X_train, y_train)

    if hasattr(model, 'feature_importances_'):
        # Estimators that expose per-feature importances directly.
        feature_importance = model.feature_importances_
    elif hasattr(model, 'coef_'):
        # Linear estimators: magnitude of the first coefficient row.
        # NOTE(review): only row 0 is read, which assumes a binary target - confirm.
        feature_importance = np.abs(model.coef_[0])
    else:
        print(f"\n{model_name} does not provide feature importance scores.")
        continue

    # One row per feature, highest importance first.
    feature_importance_df = pd.DataFrame({
        'Feature': X_train.columns,
        'Importance': feature_importance,
    }).sort_values(by='Importance', ascending=False)

    fig, ax = plt.subplots(figsize=(16, 20))
    ax.barh(feature_importance_df['Feature'], feature_importance_df['Importance'])
    # barh draws the first row at the bottom; flip the axis so the most
    # important feature is at the top of the chart.
    ax.invert_yaxis()
    ax.set_xlabel('Importance Score')
    ax.set_ylabel('Feature')
    ax.set_title(f'{model_name} - Feature Importance')
    plt.show()

    # Keep features whose importance is at least the mean importance.
    # prefit=True reuses the model fitted above instead of training it again.
    top_features = SelectFromModel(model, threshold='mean', prefit=True).get_support()
    print(f"\nTop Features for {model_name}:\n", X_train.columns[top_features])

In [None]:
# Evaluate every candidate: 5-fold cross-validated accuracy on the training
# split, then accuracy and a classification report on the held-out test split.
for model_name, model in models:
    # Fit on the full training split; this fitted instance scores the test set.
    model.fit(X_train, y_train)

    fold_accuracy = cross_val_score(model, X_train, y_train, cv=5, scoring='accuracy')
    test_predictions = model.predict(X_test)
    test_accuracy = accuracy_score(y_test, test_predictions)

    print(f"\nModel: {model_name}")
    print(f"Cross-Validation Accuracy: {fold_accuracy.mean():.4f} (+/- {fold_accuracy.std():.4f})")
    print(f"Test Accuracy: {test_accuracy:.4f}")

    # Per-class precision, recall and F1 on the test split.
    print("\nClassification Report:")
    print(classification_report(y_test, test_predictions))

**Decision Tree:**
Cross-Validation Accuracy: 62.00% (+/- 2.18)
Test Accuracy: 62.50%
Precision, recall, and F1-score are around 0.62 for both classes.

**Random Forest:**
Cross-Validation Accuracy: 64.75% (+/- 2.64)
Test Accuracy: 69.50%
Precision, recall, and F1-score are around 0.69 for both classes.

**Gradient Boosting:**
Cross-Validation Accuracy: 66.25% (+/- 1.31)
Test Accuracy: 71.50%
Precision, recall, and F1-score are around 0.71 for both classes.

**Logistic Regression:**
Cross-Validation Accuracy: 62.88% (+/- 1.88)
Test Accuracy: 63.00%
Precision, recall, and F1-score are around 0.63 for both classes.

**Support Vector Machine:**
Cross-Validation Accuracy: 63.75% (+/- 1.37)
Test Accuracy: 64.50%
Precision, recall, and F1-score are around 0.64 for both classes.

**Observations:**
Random Forest and Gradient Boosting show the best performance, with test accuracy above 65% (69.50% and 71.50%); Support Vector Machine follows at 64.50%.
Gradient Boosting has the highest test accuracy at 71.50%.
The models appear to be balanced, with similar precision, recall, and F1-scores for both classes.