## Phase 1: Understanding and Analyzing the Data

### Reading and Understanding the Data
- **Explanation:**  
  - We load the training and testing data using the Pandas library to understand their structure.  
  - We use `pd.read_csv` to read files like `train_feats.csv` and `test_feats.csv`, along with mutation and methylation data.  

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVC
from sklearn.feature_selection import SelectKBest, f_classif

# Phase 1: Data Loading and Exploration
print("Loading data...")
train_muts = pd.read_csv('train_muts_data.csv')
test_muts = pd.read_csv('test_muts_data.csv')
train_feats = pd.read_csv('train_feats.csv')
test_feats = pd.read_csv('test_feats.csv')
genes_100 = pd.read_csv('100_genes.csv')
train_meth = pd.read_csv('train_meth_data.csv')
test_meth = pd.read_csv('test_meth_data.csv')

# Check for missing values in all datasets
print("Checking for missing values:")
for df, name in [(train_muts, 'train_muts'), (test_muts, 'test_muts'), (train_feats, 'train_feats'), 
                (test_feats, 'test_feats'), (train_meth, 'train_meth'), (test_meth, 'test_meth')]:
    print(f"{name}: {df.isnull().sum().sum()} missing values")

# Verify that genes in mutation and methylation data are in 100_genes.csv
gene_set = set(genes_100['gene'])
assert set(train_muts['Gene_name']).issubset(gene_set), "Mutation genes not in 100_genes"
assert set(train_meth['matching_genes']).issubset(gene_set), "Methylation genes not in 100_genes"


Loading data...
Checking for missing values:
train_muts: 0 missing values
test_muts: 0 missing values
train_feats: 0 missing values
test_feats: 0 missing values
train_meth: 0 missing values
test_meth: 0 missing values


### Exploring the Data
- **Explanation:**  
  - We examine the distribution of labels in the training data to check for class balance using `value_counts()`.  
  - We check for missing values using `isnull().sum()` to ensure data quality.  

In [2]:
# Display label distribution and missing values in feature files
print("Label Distribution in Training Data:")
print(train_feats['Label'].value_counts())
print("Missing Values in Training Features:")
print(train_feats.isnull().sum())
print("Missing Values in Testing Features:")
print(test_feats.isnull().sum())

Label Distribution in Training Data:
Label
2.0    408
1.0    397
Name: count, dtype: int64
Missing Values in Training Features:
case_id               0
Label                 0
Mutations_in_ABL1     0
Mutations_in_AKT1     0
Mutations_in_ALK      0
                     ..
Mutations_in_TSC1     0
Mutations_in_TSC2     0
Mutations_in_U2AF1    0
Mutations_in_VHL      0
Mutations_in_WT1      0
Length: 102, dtype: int64
Missing Values in Testing Features:
case_id               0
Mutations_in_ABL1     0
Mutations_in_AKT1     0
Mutations_in_ALK      0
Mutations_in_APC      0
                     ..
Mutations_in_TSC1     0
Mutations_in_TSC2     0
Mutations_in_U2AF1    0
Mutations_in_VHL      0
Mutations_in_WT1      0
Length: 101, dtype: int64


---
#### 2.

In [3]:
# Phase 2: Feature Creation for Task 1
print("Creating mutation features...")
# Total mutations per patient
train_total_muts = train_muts.groupby('case_id').size().reset_index(name='Total_Mutations')
test_total_muts = test_muts.groupby('case_id').size().reset_index(name='Total_Mutations')

# Mutations per variant classification
train_var_counts = train_muts.groupby(['case_id', 'Variant_Classification']).size().unstack(fill_value=0)
train_var_counts.columns = [f'Mutations_{col}' for col in train_var_counts.columns]
test_var_counts = test_muts.groupby(['case_id', 'Variant_Classification']).size().unstack(fill_value=0)
test_var_counts.columns = [f'Mutations_{col}' for col in test_var_counts.columns]

# Mutations per variant classification per gene
train_gene_var = train_muts.groupby(['case_id', 'Gene_name', 'Variant_Classification']).size().unstack(level=[1, 2], fill_value=0)
train_gene_var.columns = [f'Mutations_in_{gene}_{var}' for gene, var in train_gene_var.columns]
test_gene_var = test_muts.groupby(['case_id', 'Gene_name', 'Variant_Classification']).size().unstack(level=[1, 2], fill_value=0)
test_gene_var.columns = [f'Mutations_in_{gene}_{var}' for gene, var in test_gene_var.columns]

# Additional feature: Mutation types (Transition, Transversion, Insertion, Deletion, Other)
# Justification: Different mutation types may indicate distinct biological mechanisms in cancer
def classify_mutation(row):
    ref, tumor = row['Reference_Allele'], row['Tumor_Seq_Allele1']
    if len(ref) == len(tumor) == 1:
        transitions = {('A', 'G'), ('G', 'A'), ('C', 'T'), ('T', 'C')}
        return 'Transition' if (ref, tumor) in transitions else 'Transversion'
    elif len(ref) > len(tumor):
        return 'Deletion'
    elif len(ref) < len(tumor):
        return 'Insertion'
    return 'Other'

train_muts['Mut_Type'] = train_muts.apply(classify_mutation, axis=1)
test_muts['Mut_Type'] = test_muts.apply(classify_mutation, axis=1)
train_mut_types = train_muts.groupby(['case_id', 'Mut_Type']).size().unstack(fill_value=0)
train_mut_types.columns = [f'Mutations_{col}' for col in train_mut_types.columns]
test_mut_types = test_muts.groupby(['case_id', 'Mut_Type']).size().unstack(fill_value=0)
test_mut_types.columns = [f'Mutations_{col}' for col in test_mut_types.columns]

# Additional feature: Strand bias
# Justification: Strand bias may reveal preferences in mutation or DNA repair mechanisms
train_strand = train_muts.groupby('case_id')['Mut_Strand'].apply(lambda x: (x == '+').mean()).reset_index(name='Strand_Bias')
test_strand = test_muts.groupby('case_id')['Mut_Strand'].apply(lambda x: (x == '+').mean()).reset_index(name='Strand_Bias')

# Additional feature: Normalized mutation rate
# Justification: Normalizing by gene length accounts for bias due to varying gene sizes
genes_100['Length'] = genes_100['Sequence'].str.len()
train_norm_muts = train_muts.groupby(['case_id', 'Gene_name']).size().reset_index(name='Count')
train_norm_muts = train_norm_muts.merge(genes_100[['gene', 'Length']], left_on='Gene_name', right_on='gene')
train_norm_muts['Norm_Mutations'] = train_norm_muts['Count'] / train_norm_muts['Length']
train_norm_muts = train_norm_muts.pivot(index='case_id', columns='Gene_name', values='Norm_Mutations').fillna(0)
train_norm_muts.columns = [f'Norm_Mutations_in_{col}' for col in train_norm_muts.columns]
test_norm_muts = test_muts.groupby(['case_id', 'Gene_name']).size().reset_index(name='Count')
test_norm_muts = test_norm_muts.merge(genes_100[['gene', 'Length']], left_on='Gene_name', right_on='gene')
test_norm_muts['Norm_Mutations'] = test_norm_muts['Count'] / test_norm_muts['Length']
test_norm_muts = test_norm_muts.pivot(index='case_id', columns='Gene_name', values='Norm_Mutations').fillna(0)
test_norm_muts.columns = [f'Norm_Mutations_in_{col}' for col in test_norm_muts.columns]

# Combine all mutation features
train_features = train_total_muts.set_index('case_id').join([train_var_counts, train_gene_var, train_mut_types, 
                                                            train_strand.set_index('case_id'), train_norm_muts]).reset_index()
test_features = test_total_muts.set_index('case_id').join([test_var_counts, test_gene_var, test_mut_types, 
                                                        test_strand.set_index('case_id'), test_norm_muts]).reset_index()

Creating mutation features...


---
### 3.

In [None]:
# Phase 3: Graph Creation
print("Creating graph...")
var_totals = train_muts['Variant_Classification'].value_counts()
plt.figure(figsize=(10, 6))
var_totals.plot(kind='bar')
plt.xlabel('Variant Classification')
plt.ylabel('Number of Mutations')
plt.title('Mutation Distribution by Variant Type')
plt.savefig('mutation_distribution.png')
plt.show('mutation_distribution.png')
plt.close()

Creating graph...


---

In [None]:
# Phase 4: Classifier for Task 1
print("Training Task 1 classifier...")
# Merge features with labels
train_data = train_features.merge(train_feats[['case_id', 'Label']], on='case_id')

# Prepare training and test features
X_train = train_data.drop(columns=['case_id', 'Label']).fillna(0)
y_train = train_data['Label']
X_test = test_features.drop(columns=['case_id']).fillna(0)
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

# Feature selection: Select top 50 features using SelectKBest
selector = SelectKBest(score_func=f_classif, k=50)
X_train_selected = selector.fit_transform(X_train, y_train)
X_test_selected = selector.transform(X_test)

# Split data for validation
X_train_split, X_val, y_train_split, y_val = train_test_split(X_train_selected, y_train, test_size=0.2, 
                                                            stratify=y_train, random_state=42)

# Define and train SVM with expanded grid search
param_grid = {'C': [0.01, 0.1, 1, 10, 100], 'degree': [2, 3, 4], 'gamma': ['scale', 'auto', 0.1, 1]}
svm = SVC(kernel='poly', random_state=42)
grid_search = GridSearchCV(svm, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train_split, y_train_split)

# Output results
print(f"Best params: {grid_search.best_params_}")
y_val_pred = grid_search.predict(X_val)
error = (y_val_pred != y_val).mean()
print(f"Validation error: {error}")


Training Task 1 classifier...


  f = msb / msw


In [None]:
# Train final model on full training data and predict on test set
svm_final = SVC(kernel='poly', C=grid_search.best_params_['C'], degree=grid_search.best_params_['degree'], 
                gamma=grid_search.best_params_['gamma'], random_state=42)
svm_final.fit(X_train_selected, y_train)
y_test_pred = svm_final.predict(X_test_selected)

# Save predictions
pd.DataFrame({'id_case': test_features['case_id'], 'label_predict': y_test_pred}).to_csv('task1_predictions.csv', index=False)


---

In [None]:
# Phase 5: Feature Creation for Task 2
print("Creating methylation features...")
# Required feature: Average methylation per gene
train_meth_avg = train_meth.groupby(['case_id', 'matching_genes'])['beta_val'].mean().unstack(fill_value=0)
train_meth_avg.columns = [f'Meth_Avg_{col}' for col in train_meth_avg.columns]
test_meth_avg = test_meth.groupby(['case_id', 'matching_genes'])['beta_val'].mean().unstack(fill_value=0)
test_meth_avg.columns = [f'Meth_Avg_{col}' for col in test_meth_avg.columns]

# Additional feature: Standard deviation of methylation values
# Justification: Captures variability in methylation levels, which may indicate heterogeneous epigenetic patterns
train_meth_std = train_meth.groupby(['case_id', 'matching_genes'])['beta_val'].std().unstack(fill_value=0)
train_meth_std.columns = [f'Meth_Std_{col}' for col in train_meth_std.columns]
test_meth_std = test_meth.groupby(['case_id', 'matching_genes'])['beta_val'].std().unstack(fill_value=0)
test_meth_std.columns = [f'Meth_Std_{col}' for col in test_meth_std.columns]

# Additional feature: Proportion of high-methylation sites (beta_val > 0.7)
# Justification: High methylation levels may be associated with gene silencing in cancer
train_meth_high = train_meth[train_meth['beta_val'] > 0.7].groupby(['case_id', 'matching_genes']).size().unstack(fill_value=0)
train_meth_high = train_meth_high.div(train_meth.groupby(['case_id', 'matching_genes']).size().unstack(fill_value=1)).fillna(0)
train_meth_high.columns = [f'Meth_High_Prop_{col}' for col in train_meth_high.columns]
test_meth_high = test_meth[test_meth['beta_val'] > 0.7].groupby(['case_id', 'matching_genes']).size().unstack(fill_value=0)
test_meth_high = test_meth_high.div(test_meth.groupby(['case_id', 'matching_genes']).size().unstack(fill_value=1)).fillna(0)
test_meth_high.columns = [f'Meth_High_Prop_{col}' for col in test_meth_high.columns]

# Combine mutation and methylation features
train_combined = train_features.set_index('case_id').join([train_meth_avg, train_meth_std, train_meth_high]).reset_index()
test_combined = test_features.set_index('case_id').join([test_meth_avg, test_meth_std, test_meth_high]).reset_index()


Creating methylation features...


In [None]:
# Phase 6: Classifier for Task 2
print("Training Task 2 classifier...")
# Prepare training and test features
X_train_combined = train_combined.drop(columns=['case_id']).fillna(0)
X_test_combined = test_combined.drop(columns=['case_id']).fillna(0)
X_test_combined = X_test_combined.reindex(columns=X_train_combined.columns, fill_value=0)

# Feature selection: Select top 50 features
selector = SelectKBest(score_func=f_classif, k=50)
X_train_combined_selected = selector.fit_transform(X_train_combined, y_train)
X_test_combined_selected = selector.transform(X_test_combined)

# Split data for validation
X_train_split, X_val, y_train_split, y_val = train_test_split(X_train_combined_selected, y_train, test_size=0.2, 
                                                            stratify=y_train, random_state=42)

# Train SVM with expanded grid search
grid_search.fit(X_train_split, y_train_split)

# Output results
print(f"Best params: {grid_search.best_params_}")
y_val_pred = grid_search.predict(X_val)
error = (y_val_pred != y_val).mean()
print(f"Validation error: {error}")

# Train final model and predict on test set
svm_final = SVC(kernel='poly', C=grid_search.best_params_['C'], degree=grid_search.best_params_['degree'], 
                gamma=grid_search.best_params_['gamma'], random_state=42)
svm_final.fit(X_train_combined_selected, y_train)
y_test_pred = svm_final.predict(X_test_combined_selected)

# Save predictions
pd.DataFrame({'id_case': test_combined['case_id'], 'label_predict': y_test_pred}).to_csv('task2_predictions.csv', index=False)

print("Project completed. Predictions saved.")

Training Task 2 classifier...
Best params: {'C': 10, 'degree': 2}
Validation error: 0.37888198757763975
Task 2 completed. Predictions saved.
