# In [None]:
import io
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import streamlit as st
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeRegressor

def _training_rmse(model, features, target):
    """Fit *model* on the given data and return its RMSE on that same data.

    The score is computed in-sample (there is no hold-out split), so it
    describes how closely the model fits the uploaded rows, not how well it
    would predict unseen ones.
    """
    model.fit(features, target)
    predictions = model.predict(features)
    return np.sqrt(mean_squared_error(target, predictions))


# Set up the Streamlit app
st.title("Regression Analysis Web App")
st.write("Upload your dataset for regression analysis.")

# File uploader for user to upload their dataset
uploaded_file = st.file_uploader("Choose a file", type=["csv", "xlsx"])

if uploaded_file is not None:
    # Read the uploaded file. The extension is compared case-insensitively so
    # a name such as "DATA.CSV" is not routed to the Excel reader.
    if uploaded_file.name.lower().endswith(".csv"):
        df = pd.read_csv(uploaded_file)
    else:
        df = pd.read_excel(uploaded_file)
    st.write("Dataset Preview:")
    st.dataframe(df.head())

    # Data preprocessing
    st.write("Data Cleaning and Preparation")
    df = df.dropna()  # Drop every row that has at least one missing value
    st.write("Dropped missing values.")

    # Select relevant columns
    st.write("Select the columns for analysis.")
    all_columns = df.columns.tolist()
    columns = st.multiselect(
        "Select columns for analysis:", all_columns, default=all_columns
    )
    if df.empty or not columns:
        st.error("No data left to analyse: select at least one column and "
                 "make sure the file has rows without missing values.")
        st.stop()
    # Work on an explicit copy so the in-place encoding further down never
    # writes into a view of the frame that was read from the upload.
    df = df[columns].copy()

    # Let the user choose the regression target instead of hard-coding it.
    # 'Salary' stays the default when present, matching the previous behaviour.
    if "Salary" in columns:
        default_target_index = columns.index("Salary")
    else:
        default_target_index = len(columns) - 1
    target_column = st.selectbox(
        "Select the target column:", columns, index=default_target_index
    )

    # Display basic info
    st.write("Basic Information:")
    # DataFrame.info() prints its report and returns None, so capture the
    # report in a buffer and show the text itself.
    info_buffer = io.StringIO()
    df.info(buf=info_buffer)
    st.text(info_buffer.getvalue())

    # Display visualizations
    st.write("Visualizations")

    # Categorical columns are still raw strings at this point, so every plot
    # and the correlation matrix are restricted to the numeric columns.
    numeric_df = df.select_dtypes(include="number")
    if numeric_df.columns.empty:
        st.info("No numeric columns selected; skipping visualizations.")
    else:
        # Boxplot
        st.write("Boxplot")
        fig, ax = plt.subplots(figsize=(10, 6))
        sns.boxplot(data=numeric_df, ax=ax)
        st.pyplot(fig)
        plt.close(fig)  # release the figure; the script reruns on each interaction

        # Histogram
        st.write("Histogram")
        # Let pandas build its own grid of axes (one per column) rather than
        # handing it a single Axes it would have to clear and replace.
        hist_axes = numeric_df.hist(bins=20, figsize=(10, 6))
        fig = np.asarray(hist_axes).ravel()[0].get_figure()
        st.pyplot(fig)
        plt.close(fig)

        # Scatter plot matrix
        st.write("Scatter Plot Matrix")
        pair_grid = sns.pairplot(numeric_df)
        st.pyplot(pair_grid)
        plt.close("all")  # also closes the figure owned by the pair grid

        # Correlation heatmap
        st.write("Correlation Heatmap")
        corr = numeric_df.corr()
        fig, ax = plt.subplots(figsize=(10, 6))
        sns.heatmap(corr, annot=True, cmap='coolwarm', ax=ax)
        st.pyplot(fig)
        plt.close(fig)

    # The target must be numeric for regression; label-encoding a text target
    # would make the models regress on arbitrary category codes.
    if not pd.api.types.is_numeric_dtype(df[target_column]):
        st.error(f"Target column '{target_column}' is not numeric; "
                 "choose a numeric column as the regression target.")
        st.stop()

    # Encode categorical variables, one encoder per column so that each
    # column's mapping survives and can be inverted later.
    encoders = {}
    le = LabelEncoder()  # stays unfitted when there is nothing to encode
    for column in df.select_dtypes(include=["object", "category"]).columns:
        le = LabelEncoder()
        # Cast to str so a column mixing strings with other objects can be sorted.
        df[column] = le.fit_transform(df[column].astype(str))
        encoders[column] = le

    # Split the data into features and target variable
    X = df.drop(columns=[target_column])
    y = df[target_column]
    if X.shape[1] == 0:
        st.error("Select at least one feature column besides the target.")
        st.stop()

    # Fit each model and report its training RMSE. The random forest is kept
    # in a variable because it is the model that gets saved below.
    random_forest_reg = RandomForestRegressor(random_state=0)
    models = {
        "Linear Regression": LinearRegression(),
        "Decision Tree Regression": DecisionTreeRegressor(random_state=0),
        "Random Forest Regression": random_forest_reg,
    }
    for label, model in models.items():
        error = _training_rmse(model, X, y)
        st.write(f"{label} RMSE: ${error:,.02f}")

    # Save the model
    model_data = {
        "model": random_forest_reg,
        # Kept for consumers of the earlier file layout: the most recently
        # fitted encoder (or an unfitted one when no column was encoded).
        "le": le,
        "encoders": encoders,
        "target": target_column,
        "features": X.columns.tolist(),
    }
    with open('regression_model.pkl', 'wb') as file:
        pickle.dump(model_data, file)

    st.write("Model saved as 'regression_model.pkl'")

    st.write("App finished processing.")

else:
    st.write("Awaiting CSV/XLSX file to be uploaded.")
