In [1]:
# Step 1: Data Cleaning - Fixing the ARFF File

def fix_arff_file(filepath, output_filepath):
    # Read the ARFF file as text
    with open(filepath, 'r') as file:
        lines = file.readlines()
    
    # Process lines to find and rename duplicate columns
    columns_seen = set()
    for i, line in enumerate(lines):
        if line.lower().startswith('@attribute'):
            # Extract the column name
            parts = line.split()
            col_name = parts[1].strip("'")
            
            # Check for duplicates
            if col_name in columns_seen:
                # Append a suffix to make it unique
                new_col_name = f"{col_name}_{len(columns_seen)}"
                lines[i] = line.replace(col_name, new_col_name)
            else:
                columns_seen.add(col_name)
    
    # Write the fixed lines to a new ARFF file
    with open(output_filepath, 'w') as file:
        file.writelines(lines)

# Apply this function to your lung dataset
fix_arff_file('Lung.arff', 'Lung_fixed.arff')


In [2]:
import pandas as pd
from scipy.io import arff

# Step 2.1: Load the Fixed ARFF File
def load_arff_data(filepath):
    # Load the ARFF file
    data, meta = arff.loadarff(filepath)
    
    # Convert the data into a pandas DataFrame
    df = pd.DataFrame(data)
    
    # Decode byte strings (if any) to standard strings
    for column in df.select_dtypes([object]).columns:
        df[column] = df[column].str.decode('utf-8')
    
    return df

# Load the fixed lung dataset
lung_df = load_arff_data('Lung_fixed.arff')

# Step 2.2: Explore the Data
# Display the first few rows of the dataset
print(lung_df.head())

# Check the data types of the columns
print(lung_df.dtypes)

# Get a summary of the data (e.g., number of missing values, basic stats)
print(lung_df.describe())

# Check for missing values
print(lung_df.isnull().sum())

# Check for duplicate rows
print(lung_df.duplicated().sum())


   AFFX-MurIL2_at  AFFX-MurIL10_at  AFFX-MurIL4_at  AFFX-MurFAS_at  \
0         -18.600            10.54           0.010          19.440   
1           9.120             9.12          10.180          29.290   
2          -2.175            -2.21          -0.060           6.320   
3          -1.540            21.75           5.835          23.815   
4          -9.070             3.08          -1.980          17.260   

   AFFX-BioB-5_at  AFFX-BioB-M_at  AFFX-BioB-3_at  AFFX-BioC-5_at  \
0         -16.980          -27.50          -1.600           38.88   
1          -4.680           -1.50          -3.620           20.80   
2          -1.775          -16.53          -3.610           16.41   
3         -24.785          -12.89          -4.485           19.50   
4         -10.090          -15.15         -18.190           13.21   

   AFFX-BioC-3_at  AFFX-BioDn-5_at  ...  101_at  102_at  103_at  104_at  \
0         -29.120          -42.870  ...   8.110  33.210  30.780   47.80   
1         -13.

In [3]:
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, f_classif
import numpy as np

# Step 3.1: Handle the Target Variable
def preprocess_target(df, target_col='type'):
    le = LabelEncoder()
    df[target_col] = le.fit_transform(df[target_col])
    return df

lung_df = preprocess_target(lung_df)

# Step 3.2: Feature Selection
def select_features(df, target_col='type', top_k=100):
    X = df.drop(columns=[target_col])
    y = df[target_col]
    
    # SelectKBest with ANOVA F-value
    selector = SelectKBest(score_func=f_classif, k=top_k)
    X_new = selector.fit_transform(X, y)
    
    # Get the indices of selected features
    selected_indices = selector.get_support(indices=True)
    selected_features = X.columns[selected_indices]
    
    return df[selected_features.tolist() + [target_col]]

# Select top 100 features
lung_df_selected = select_features(lung_df, top_k=100)

# Step 3.3: Data Normalization/Standardization
def normalize_data(df):
    scaler = StandardScaler()
    X = df.drop(columns=['type'])
    y = df['type']
    
    X_scaled = scaler.fit_transform(X)
    
    return pd.DataFrame(X_scaled, columns=X.columns), y

X_scaled, y = normalize_data(lung_df_selected)

# Step 3.4: Split the Dataset
def split_data(X, y, test_size=0.2, random_state=42):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)
    return X_train, X_test, y_train, y_test

X_train, X_test, y_train, y_test = split_data(X_scaled, y)

# Output some basic information about the splits
print(f"Training set size: {X_train.shape[0]}")
print(f"Test set size: {X_test.shape[0]}")


Training set size: 162
Test set size: 41


In [4]:
import numpy as np
import pandas as pd
from scipy.io import arff
from sklearn.feature_selection import mutual_info_classif, SelectKBest, chi2, f_classif, RFE
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import VarianceThreshold
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Load ARFF data into DataFrame
def load_arff_to_df(filepath):
    data, meta = arff.loadarff(filepath)
    df = pd.DataFrame(data)
    return df

# Load the dataset
df = load_arff_to_df('Lung_fixed.arff')

# Separate features and target
X = df.drop(columns=['type'])
y = df['type'].astype('str')  # Convert target to string if it's categorical



In [5]:
from sklearn.feature_selection import mutual_info_classif

# 1. Mutual Information
mi = mutual_info_classif(X, y)
mi_scores = pd.Series(mi, index=X.columns).sort_values(ascending=False)
print("Mutual Information Scores:")
print(mi_scores)


Mutual Information Scores:
41325_at      0.522847
38138_at      0.501622
33322_i_at    0.487682
36924_r_at    0.476928
32254_at      0.472818
                ...   
33710_at      0.000000
33698_at      0.000000
1601_s_at     0.000000
33253_at      0.000000
36465_at      0.000000
Length: 12600, dtype: float64


In [6]:
from sklearn.feature_selection import SelectKBest, chi2

# Ensure data is non-negative
X_non_negative = X.clip(lower=0)

# 2. Chi-Square
chi2_selector = SelectKBest(chi2, k='all')
X_chi2 = chi2_selector.fit_transform(X_non_negative, y)
chi2_scores = pd.Series(chi2_selector.scores_, index=X.columns).sort_values(ascending=False)
print("Chi-Square Scores:")
print(chi2_scores)


Chi-Square Scores:
40808_at      792239.030250
33377_at      420249.997973
32252_at      419108.344009
613_at        323155.060274
39026_r_at    320199.463616
                  ...      
1821_at                 NaN
1777_at                 NaN
1374_g_at               NaN
499_at                  NaN
345_at                  NaN
Length: 12600, dtype: float64


In [7]:
from sklearn.feature_selection import f_classif

# 3. ANOVA F-Value
f_values, _ = f_classif(X, y)
anova_scores = pd.Series(f_values, index=X.columns).sort_values(ascending=False)
print("ANOVA F-Value Scores:")
print(anova_scores)


ANOVA F-Value Scores:
36160_s_at    368.030678
32254_at      342.758796
36148_at      321.307693
40825_at      298.749976
40165_at      268.788325
                 ...    
36600_at        0.107660
33659_at        0.102819
1818_at         0.081640
38189_s_at      0.068180
847_at          0.036264
Length: 12600, dtype: float64


In [8]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Lasso
from sklearn.feature_selection import SelectFromModel

# Standardize features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Apply Lasso with standardized features
lasso = Lasso(alpha=0.1)  # Adjust alpha as needed
lasso.fit(X_scaled, y)
model = SelectFromModel(lasso, prefit=True)
X_lasso = model.transform(X_scaled)
lasso_support = model.get_support()
lasso_features = X.columns[lasso_support]
print("Lasso Selected Features with Scaling:")
print(lasso_features)


Lasso Selected Features with Scaling:
Index(['31638_at', '33693_at', '33529_at', '35414_s_at', '37422_at',
       '37826_at', '38202_at', '38918_at', '39581_at', '39625_at', '39670_at',
       '40304_at', '41385_at', '31791_at', '32025_at', '32028_at', '32034_at',
       '32626_at', '33230_at', '33808_at', '35177_at', '37603_at', '38643_at',
       '39685_at', '39795_at', '39799_at', '40128_at', '40410_at', '40766_at',
       '40808_at', '41156_g_at', '41197_at', '33375_at', '33423_g_at',
       '35840_at', '36105_at', '36139_at', '36209_at', '37044_at', '37363_at',
       '38368_at', '38814_at', '39561_at', '40581_at', '40961_at', '41337_at',
       '41338_at', '41498_at', '41809_at', '1420_s_at', '1317_at', '897_at',
       '613_at', '402_s_at', '376_at', '319_g_at'],
      dtype='object')


In [28]:
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import StandardScaler

# 5. Variance Threshold
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
variance_threshold = VarianceThreshold(threshold=0.01)  # Adjust the threshold as needed
X_var = variance_threshold.fit_transform(X_scaled)
var_scores = pd.Series(variance_threshold.variances_, index=X.columns[variance_threshold.get_support()]).sort_values(ascending=False)
print("Variance Threshold Scores:")
print(var_scores)


Variance Threshold Scores:
35553_at      1.0
39768_at      1.0
39872_at      1.0
1592_at       1.0
32255_i_at    1.0
             ... 
33331_at      1.0
35739_at      1.0
40908_r_at    1.0
34623_at      1.0
32957_g_at    1.0
Length: 12600, dtype: float64


In [None]:
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier

# 6. Recursive Feature Elimination (RFE)
rfe_model = RandomForestClassifier()
rfe = RFE(estimator=rfe_model, n_features_to_select=10)  # Select the top 10 features
rfe.fit(X, y)
rfe_support = pd.Series(rfe.support_, index=X.columns)
rfe_ranking = pd.Series(rfe.ranking_, index=X.columns).sort_values(ascending=True)
print("RFE Feature Rankings (1 is best):")
print(rfe_ranking)


In [None]:
from sklearn.ensemble import RandomForestClassifier

# 7. Tree-based Feature Selection
forest = RandomForestClassifier(n_estimators=100, random_state=42)
forest.fit(X, y)
forest_importances = pd.Series(forest.feature_importances_, index=X.columns).sort_values(ascending=False)
print("Random Forest Feature Importances:")
print(forest_importances)


In [10]:
import numpy as np

# Combine scores from different methods
combined_scores = pd.DataFrame({
    'Mutual Information': mi_scores,
    'Chi-Square': chi2_scores,
    'ANOVA F-Value': anova_scores
}).fillna(0)

# Compute average score for each feature
combined_scores['Average'] = combined_scores.mean(axis=1)

# Sort by average score
combined_scores_sorted = combined_scores['Average'].sort_values(ascending=False)
print("Combined Feature Scores:")
print(combined_scores_sorted)


Combined Feature Scores:
40808_at      264162.129960
33377_at      140102.462342
32252_at      139720.702199
613_at        107748.424703
39026_r_at    106783.115692
                  ...      
1869_at            0.185449
37113_at           0.183769
40531_at           0.178674
34627_at           0.174206
38189_s_at         0.022727
Name: Average, Length: 12600, dtype: float64


In [11]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

# Normalize scores
scaler = MinMaxScaler()
normalized_scores = scaler.fit_transform(combined_scores_sorted.values.reshape(-1, 1))

# Construct decision matrix
decision_matrix = pd.DataFrame(normalized_scores, index=combined_scores_sorted.index, columns=['Score'])

# Determine ideal and negative-ideal solutions
ideal_solution = decision_matrix.max()
negative_ideal_solution = decision_matrix.min()

# Calculate distances
distance_to_ideal = np.sqrt(((decision_matrix - ideal_solution) ** 2).sum(axis=1))
distance_to_negative_ideal = np.sqrt(((decision_matrix - negative_ideal_solution) ** 2).sum(axis=1))

# Calculate TOPSIS scores
topsis_scores = distance_to_negative_ideal / (distance_to_negative_ideal + distance_to_ideal)

# Rank features
ranked_features = pd.Series(topsis_scores, index=combined_scores_sorted.index).sort_values(ascending=False)
print("TOPSIS Scores:")
print(ranked_features)


TOPSIS Scores:
40808_at      1.000000e+00
33377_at      5.303654e-01
32252_at      5.289202e-01
613_at        4.078874e-01
39026_r_at    4.042332e-01
                  ...     
1869_at       6.159950e-07
37113_at      6.096356e-07
40531_at      5.903472e-07
34627_at      5.734324e-07
38189_s_at    0.000000e+00
Length: 12600, dtype: float64


In [12]:
top_features = ranked_features.head(int(0.1 * len(ranked_features)))  # Select top 10%
print("Top Features Based on TOPSIS Scores:")
print(top_features)

Top Features Based on TOPSIS Scores:
40808_at      1.000000
33377_at      0.530365
32252_at      0.528920
613_at        0.407887
39026_r_at    0.404233
                ...   
40423_at      0.005130
37360_at      0.005120
36945_at      0.005119
34592_at      0.005114
39294_at      0.005114
Length: 1260, dtype: float64


In [13]:
# Example of extracting top features from the original dataset
top_features_names = top_features.index
X_top_features = X[top_features_names]


In [16]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Split data
X_train, X_test, y_train, y_test = train_test_split(X_top_features, y, test_size=0.3, random_state=42)

# Train model
model = RandomForestClassifier()
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate performance
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy with Top Features: {accuracy:.2f}")

Model Accuracy with Top Features: 0.93


In [17]:
importances = model.feature_importances_
importance_df = pd.DataFrame({'Feature': top_features_names, 'Importance': importances})
importance_df = importance_df.sort_values(by='Importance', ascending=False)
print("Feature Importances:")
print(importance_df)


Feature Importances:
         Feature  Importance
899     38368_at    0.015887
96      41338_at    0.012780
819     32591_at    0.012300
101     39266_at    0.012051
919     41418_at    0.011675
...          ...         ...
563   31951_s_at    0.000000
564     31722_at    0.000000
565     38261_at    0.000000
566     38604_at    0.000000
1259    39294_at    0.000000

[1260 rows x 2 columns]


In [19]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Define parameter grid
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Initialize Random Forest model
rf = RandomForestClassifier(random_state=42)

# Setup GridSearchCV
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, scoring='accuracy', n_jobs=-1)

# Fit model
grid_search.fit(X_train, y_train)

# Best parameters and accuracy
print("Best Parameters:", grid_search.best_params_)
print("Best Accuracy:", grid_search.best_score_)

# Use the best model
best_rf = grid_search.best_estimator_

# Evaluate on test set
y_pred = best_rf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Optimized Model Accuracy: {accuracy:.2f}")


Best Parameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 100}
Best Accuracy: 0.9293103448275863
Optimized Model Accuracy: 0.92


In [21]:
from sklearn.preprocessing import LabelEncoder

# Convert string labels to integers
label_encoder = LabelEncoder()
y_numeric = label_encoder.fit_transform(y)

# Split data with numeric labels
X_train, X_test, y_train, y_test = train_test_split(X, y_numeric, test_size=0.3, random_state=42)

# Initialize and train XGBoost model
xgb_model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=42)
xgb_model.fit(X_train, y_train)

# Make predictions
y_pred = xgb_model.predict(X_test)

# Evaluate performance
accuracy = accuracy_score(y_test, y_pred)
print(f"XGBoost Model Accuracy: {accuracy:.2f}")


Parameters: { "use_label_encoder" } are not used.



XGBoost Model Accuracy: 0.89


In [22]:
from sklearn.ensemble import VotingClassifier

# Initialize models
rf = RandomForestClassifier(n_estimators=200, max_depth=20, random_state=42)
xgb_model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=42)

# Create Voting Classifier
voting_clf = VotingClassifier(estimators=[('rf', rf), ('xgb', xgb_model)], voting='soft')

# Train Voting Classifier
voting_clf.fit(X_train, y_train)

# Make predictions
y_pred = voting_clf.predict(X_test)

# Evaluate performance
accuracy = accuracy_score(y_test, y_pred)
print(f"Voting Classifier Accuracy: {accuracy:.2f}")


Parameters: { "use_label_encoder" } are not used.



Voting Classifier Accuracy: 0.90


In [23]:
from sklearn.preprocessing import PolynomialFeatures

# Initialize PolynomialFeatures with a lower degree
poly = PolynomialFeatures(degree=1)  # Degree 1 is equivalent to original features
X_poly = poly.fit_transform(X_scaled)

# Proceed with splitting and training
X_train_poly, X_test_poly, y_train, y_test = train_test_split(X_poly, y_numeric, test_size=0.3, random_state=42)


In [25]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.feature_selection import SelectFromModel

# Standardize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Feature Selection using RandomForestClassifier
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_scaled, y)
feature_importances = rf_model.feature_importances_
important_features = SelectFromModel(rf_model, threshold='mean', prefit=True)
X_important = important_features.transform(X_scaled)

# Dimensionality Reduction using PCA
pca = PCA(n_components=0.95)  # Retain 95% of variance
X_pca = pca.fit_transform(X_important)

# Split data
X_train, X_test, y_train, y_test = train_test_split(X_pca, y, test_size=0.3, random_state=42)

# Train model
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate performance
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy with Feature Selection and PCA: {accuracy:.2f}")

# Feature Importances
importances = model.feature_importances_
importance_df = pd.DataFrame({'Feature': np.arange(len(importances)), 'Importance': importances})
importance_df = importance_df.sort_values(by='Importance', ascending=False)
print("Feature Importances:")
print(importance_df)


Model Accuracy with Feature Selection and PCA: 0.74
Feature Importances:
     Feature  Importance
2          2    0.101832
3          3    0.087548
1          1    0.084369
0          0    0.043300
13        13    0.040454
..       ...         ...
26        26    0.001433
123      123    0.001296
122      122    0.001117
125      125    0.001089
31        31    0.001036

[128 rows x 2 columns]


In [26]:
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb
from sklearn.metrics import accuracy_score

# Initialize LabelEncoder
label_encoder = LabelEncoder()

# Fit and transform the target labels
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

# Initialize XGBoost model
xgb_model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=42)

# Train model
xgb_model.fit(X_train, y_train_encoded)

# Make predictions
y_pred_encoded = xgb_model.predict(X_test)

# Convert predictions back to original labels
y_pred = label_encoder.inverse_transform(y_pred_encoded)

# Evaluate performance
accuracy = accuracy_score(y_test, y_pred)
print(f"XGBoost Model Accuracy: {accuracy:.2f}")


XGBoost Model Accuracy: 0.93


Parameters: { "use_label_encoder" } are not used.

