In [1]:
%%writefile dataanalysis.py

import io
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import streamlit as st
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import mutual_info_classif
from sklearn.impute import KNNImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder

# Page heading shown at the top of the Streamlit app.
st.title("Protein Data Analysis")

# CSV-only file picker; the analysis below runs only when this is not None.
uploaded_file = st.file_uploader(label="Choose a CSV file", type="csv")

def _show_figure(fig):
    """Render a Matplotlib figure in the app, then release it."""
    st.pyplot(fig)
    # Streamlit re-executes the script on every interaction; figures that are
    # never closed pile up in pyplot's global figure registry.
    plt.close(fig)


def _show_ranking(scores, feature_names, score_label):
    """Plot per-feature scores as a horizontal bar chart, highest first."""
    ranking = pd.DataFrame({'Feature': feature_names, score_label: scores})
    ranking = ranking.sort_values(by=score_label, ascending=False)
    fig, ax = plt.subplots(figsize=(12, 6))
    sns.barplot(x=score_label, y='Feature', data=ranking, ax=ax)
    _show_figure(fig)


if uploaded_file is not None:
    # Load the dataset
    data = pd.read_csv(uploaded_file)

    # Display basic information about the dataset.
    # DataFrame.info() writes its report to a buffer and returns None, so the
    # text has to be captured explicitly to be shown in the app.
    st.subheader('Data Overview')
    info_buffer = io.StringIO()
    data.info(buf=info_buffer)
    st.text(info_buffer.getvalue())
    st.write(data.head())

    # Fail with a readable message instead of a raw KeyError when the upload
    # does not have the columns the rest of the analysis relies on.
    required_cols = ['Genotype', 'Treatment', 'Behavior', 'class']
    absent_cols = [col for col in required_cols if col not in data.columns]
    if absent_cols:
        st.error(f"The uploaded file is missing required column(s): {', '.join(absent_cols)}")
        st.stop()

    # Drop the 'MouseID' identifier column (tolerate files that lack it)
    data = data.drop(columns='MouseID', errors='ignore')

    # Encode categorical variables
    data['Genotype'] = data['Genotype'].map({'Control': 0, 'Ts65Dn': 1})
    data['Treatment'] = data['Treatment'].map({'Saline': 0, 'Memantine': 1})
    data['Behavior'] = data['Behavior'].map({'C/S': 0, 'S/C': 1})

    # Label encode the 'class' column
    encode = LabelEncoder().fit(data['class'])
    data['class'] = encode.transform(data['class'])

    # Save the encoder using pickle
    with open('enc.pickle', 'wb') as f:
        pickle.dump(encode, f)

    # Report the percentage of missing values per column
    missing_values = data.isnull().mean() * 100
    missing_values = missing_values[missing_values > 0].sort_values(ascending=False)
    st.subheader('Missing Values')
    st.write(missing_values)

    # Everything except the target is a feature. The target is deliberately
    # kept out of imputation and scaling: it must stay an integer label, and
    # it should not influence the neighbour distances used for imputation.
    feature_cols = data.columns.drop('class')

    # Impute missing feature values using KNN imputer
    if missing_values.sum() > 0:
        imputer = KNNImputer(n_neighbors=5, weights='uniform', missing_values=np.nan)
        data[feature_cols] = imputer.fit_transform(data[feature_cols])

    st.write(f'Total missing values after imputation: {data.isnull().sum().sum()}')

    # Normalize the feature columns only. Standardizing 'class' would turn the
    # labels into z-scores, and casting those back to int truncates them so
    # that distinct classes collapse into a few values.
    numerical_cols = data[feature_cols].select_dtypes(include='number').columns
    scaler = StandardScaler()
    data[numerical_cols] = scaler.fit_transform(data[numerical_cols])

    # Correlation Analysis (includes the encoded target column)
    st.subheader('Correlation Matrix')
    corr_matrix = data.corr()
    fig, ax = plt.subplots(figsize=(12, 10))
    sns.heatmap(corr_matrix, annot=False, cmap='coolwarm', linewidths=0.5, ax=ax)
    _show_figure(fig)

    # Continuous features and discrete target shared by the two rankings below
    numerical_features = data.drop(columns=['Genotype', 'Treatment', 'Behavior', 'class'])
    target = data['class'].astype(int)

    # Mutual Information
    st.subheader('Mutual Information')
    # A fixed seed keeps the estimate stable across Streamlit reruns.
    mi = mutual_info_classif(
        numerical_features, target, discrete_features=False, random_state=42
    )
    _show_ranking(mi, numerical_features.columns, 'Mutual Information')

    # Feature Importance from RandomForest
    st.subheader('Feature Importance from RandomForest')
    rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
    rf_model.fit(numerical_features, target)
    _show_ranking(rf_model.feature_importances_, numerical_features.columns, 'Importance')

else:
    st.write("Please upload a CSV file to proceed.")


Overwriting dataanalysis.py


In [None]:
!streamlit run dataanalysis.py