# Capstone 2

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
def clean_data(df: pd.DataFrame) -> None:
    """Clean *df* in place: drop duplicate rows and fill missing values.

    Numeric columns have their NaN values replaced with the column mean.
    Every other column has its NaN values replaced with its most frequent
    value. A non-numeric column that holds no non-null value at all has no
    mode and is left untouched.

    Args:
        df: The DataFrame to clean. It is modified in place.

    Returns:
        None. The caller's DataFrame is updated directly.
    """
    # Drop duplicated rows
    df.drop_duplicates(inplace=True)

    # Fill NaN values with column means (for numeric columns)
    numeric_columns = df.select_dtypes(include=['number']).columns
    if len(numeric_columns) > 0:
        column_means = df[numeric_columns].mean()
        df[numeric_columns] = df[numeric_columns].fillna(column_means)

    # Fill NaN values with most frequent value (for non-numeric columns)
    non_numeric_columns = df.select_dtypes(exclude=['number']).columns
    for col in non_numeric_columns:
        modes = df[col].mode()
        if modes.empty:
            # Column is entirely missing: there is nothing to fill with.
            continue
        # Assign the result back instead of calling fillna(inplace=True) on
        # the selected column: the chained in-place form acts on a temporary
        # object and does not reliably update ``df``.
        df[col] = df[col].fillna(modes.iloc[0])

def _plot_bar(counts, column_name, grid_color):
    """Show a bar chart of the value frequencies in *counts*."""
    plt.bar(counts.index, counts.values)
    plt.grid(color=grid_color, linestyle='--', linewidth=0.5)
    plt.title(f'Bar Plot for {column_name}')
    plt.show()


def _plot_pie(counts, column_name):
    """Show a pie chart of the value frequencies in *counts*."""
    plt.pie(counts, labels=counts.index, autopct='%1.1f%%', startangle=140)
    plt.axis('equal')
    plt.title(f'Pie Chart for {column_name}')
    plt.show()


def visualize_data(df: pd.DataFrame, column_name: str) -> None:
    """Display a set of charts for one column of *df*.

    Numeric columns get a bar chart, histogram, pie chart and scatter plot of
    the value frequencies, followed by a box plot of the column values.
    Non-numeric columns get a bar chart and a pie chart of the value
    frequencies.

    Args:
        df: The DataFrame holding the data.
        column_name: Name of the column to plot.

    Returns:
        None. If the column does not exist a message is printed and nothing
        is plotted.
    """
    # Only the lookup is guarded, so a KeyError raised while plotting is not
    # misreported as a missing column.
    try:
        column_data = df[column_name]
    except KeyError:
        print("Column not found in the dataset.")
        return

    value_counts = column_data.value_counts()

    if not pd.api.types.is_numeric_dtype(column_data):
        _plot_bar(value_counts, column_name, 'green')
        _plot_pie(value_counts, column_name)
        return

    # Missing values carry no information for the distribution plots.
    observed = column_data.dropna()

    # Bar chart
    _plot_bar(value_counts, column_name, 'gray')

    # Histogram
    plt.hist(observed, bins=20, edgecolor='black', alpha=0.7)
    plt.xlabel(column_name)
    plt.ylabel('Frequency')
    plt.title(f'Histogram for {column_name}')
    plt.grid(color='gray', linestyle='--', linewidth=0.5)
    plt.show()

    # Pie chart
    _plot_pie(value_counts, column_name)

    # Scatter plot (distinct value against how often it occurs)
    plt.scatter(value_counts.index, value_counts.values)
    plt.grid(color='orange', linestyle='--', linewidth=0.5)
    plt.title(f'Scatter Plot for {column_name}')
    plt.show()

    # Box plot of the column values themselves, not of their frequencies,
    # so the plot matches its title.
    plt.boxplot(observed)
    plt.title(f'Box Plot for {column_name}')
    plt.show()

In [None]:
# Keep prompting until a supported file has been loaded into ``df``.
while True:
    # Paths pasted from a file manager or shell often carry surrounding
    # whitespace or quotes; remove them before checking the extension.
    file_path = input("Enter the file path:\n").strip().strip("\"'")
    try:
        if file_path.lower().endswith(".csv"):
            df = pd.read_csv(file_path)
            break
        elif file_path.lower().endswith(".xlsx"):
            df = pd.read_excel(file_path)
            break
        else:
            print("Unsupported file format. Please enter a .csv or .xlsx file.")
    except FileNotFoundError as e:
        print("Error:", e)
        print("Can't find such a file with this name or extension.")
    except Exception as e:
        # Interactive top-level boundary: report any other load failure
        # (parsing, permissions, missing Excel engine, ...) and ask again,
        # without claiming the file is missing.
        print("Error:", e)
        print("The file could not be read. Please check it and try again.")

clean_data(df)
print("Data has been cleaned and processed.\n")
print(df)

print('\n\n\n\n')

print("Here is a short description for your data :\n")
print(df.describe())
print('\n\n\n\n')

print("Here are the columns in the dataset:\n")
print(df.columns)

# Plot one column per iteration until the user declines to continue.
while True:
    # The column name is used exactly as typed: headers may legitimately
    # contain leading or trailing whitespace.
    column_to_visualize = input("Enter the column name you want to visualize:\n")
    visualize_data(df, column_to_visualize)

    continue_or_not = input("Do you want to visualize another column? (yes/no): ")
    # Accept "yes"/"y" regardless of case or surrounding whitespace.
    if continue_or_not.strip().lower() not in ("yes", "y"):
        break