# In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Configure seaborn's global plotting style once, at import time, so the
# figures drawn later in this file use the "whitegrid" look.
sns.set(style="whitegrid")

# Load the dataset
def load_data(file_path):
    """Read a CSV file into a pandas DataFrame.

    Args:
        file_path: Location of the CSV file, passed straight to
            ``pd.read_csv``.

    Returns:
        The loaded DataFrame, or ``None`` when the file cannot be opened or
        parsed. In the failure case the error message is printed to stdout.
    """
    try:
        data = pd.read_csv(file_path)
    except (OSError, ValueError) as e:
        # OSError covers missing or unreadable files; ValueError covers
        # pandas' EmptyDataError / ParserError as well as UnicodeDecodeError.
        # Anything else indicates a programming error and is left to surface
        # instead of being silently converted into ``None``.
        print(f"Error loading data: {e}")
        return None

    print(f"Data loaded successfully from {file_path}")
    return data

# Explore the dataset
def explore_data(data):
    """Print a quick textual overview of a DataFrame to stdout.

    The report has four sections: the first rows, the ``info()`` summary,
    the ``describe()`` statistics, and the per-column count of missing
    values.

    Args:
        data: The pandas DataFrame to inspect.

    Returns:
        None. All output goes to stdout.
    """
    print("\n--- Dataset Overview ---")
    print(data.head())  # Display the first few rows

    print("\n--- Dataset Info ---")
    # DataFrame.info() writes its report itself and returns None, so wrapping
    # it in print() would append a stray "None" line after the report.
    data.info()

    print("\n--- Statistical Summary ---")
    print(data.describe())  # Display statistical summary

    print("\n--- Missing Values ---")
    print(data.isnull().sum())  # Missing-value count per column

# Visualize data (example: histogram of a numeric column)
def visualize_data(data, column):
    """Display a 30-bin histogram with a KDE overlay for one column.

    Args:
        data: DataFrame holding the values to plot.
        column: Name of the column in ``data`` whose distribution is drawn.

    Returns:
        None. The figure is shown via ``plt.show()``.
    """
    plt.figure(figsize=(10, 6))
    # histplot draws on the current axes and hands them back, so the labels
    # can be applied to that Axes object directly.
    axes = sns.histplot(data[column], bins=30, kde=True)
    axes.set_title(f'Distribution of {column}')
    axes.set_xlabel(column)
    axes.set_ylabel('Frequency')
    plt.show()

# Preprocess the data (example: handle missing values and encode categorical variables)
def preprocess_data(data):
    """Impute missing numeric values and one-hot encode categorical columns.

    Missing entries in each numeric column are replaced by that column's
    mean. Categorical columns are then expanded with ``pd.get_dummies``
    using ``drop_first=True``.

    Args:
        data: The pandas DataFrame to preprocess. It is not modified.

    Returns:
        A new DataFrame with numeric gaps filled and categorical columns
        replaced by indicator columns.
    """
    numeric_columns = data.select_dtypes(include=[np.number]).columns
    column_means = data[numeric_columns].mean()

    # Filling with a Series keyed by column name only touches the numeric
    # columns and returns a new frame. This avoids chained
    # ``data[col].fillna(..., inplace=True)``, which is deprecated and has no
    # effect under pandas Copy-on-Write, and it leaves the caller's frame
    # untouched.
    filled = data.fillna(column_means)

    # Convert categorical variables to dummy/indicator variables
    return pd.get_dummies(filled, drop_first=True)

# Train a simple machine learning model (example: Linear Regression)
def train_model(data, target_column, *, test_size=0.2, random_state=42):
    """Fit a linear regression on ``data`` and report hold-out metrics.

    The frame is split into features (every column except
    ``target_column``) and target, divided into train and test subsets,
    and a ``LinearRegression`` is fitted on the training part. Mean squared
    error and R^2 on the test part are printed to stdout.

    Args:
        data: DataFrame containing the feature columns and the target.
        target_column: Name of the column to predict.
        test_size: Share of rows held out for evaluation, forwarded to
            ``train_test_split``. Defaults to 0.2.
        random_state: Seed forwarded to ``train_test_split`` so the split
            is reproducible. Defaults to 42.

    Returns:
        The fitted ``LinearRegression`` model, so callers can reuse it
        instead of retraining.

    Raises:
        KeyError: If ``target_column`` is not a column of ``data``.
    """
    # Split the data into features and target
    features = data.drop(columns=[target_column])
    target = data[target_column]

    # Split into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=test_size, random_state=random_state
    )

    # Train a Linear Regression model
    model = LinearRegression()
    model.fit(X_train, y_train)

    # Evaluate on the held-out rows only
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    print("\n--- Model Evaluation ---")
    print(f"Mean Squared Error: {mse:.2f}")
    print(f"R^2 Score: {r2:.2f}")

    return model

# Main function to run the project
def main():
    """Run the pipeline: load, explore, preprocess, visualize, train.

    The CSV path, the column to plot and the target column are placeholder
    strings that must be replaced with real values before running. If the
    dataset cannot be loaded (``load_data`` returns ``None``), nothing else
    is executed.
    """
    # Specify the path to your CSV file
    file_path = 'path/to/your/dataset.csv'

    data = load_data(file_path)
    if data is None:
        # Loading failed; there is nothing to analyse.
        return

    explore_data(data)
    data = preprocess_data(data)

    # Replace 'your_column_name' with an actual column name
    visualize_data(data, 'your_column_name')

    # Replace 'target_column' with your actual target column name
    train_model(data, 'target_column')

# Run the pipeline only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()