# In [None]:
import pandas as pd

# Fetch the dataset from its hosted CSV file (this step needs network access)
url = "https://raw.githubusercontent.com/data-bootcamp-v4/data/main/file1.csv"
data = pd.read_csv(filepath_or_buffer=url)

# Challenge 1: Understanding the data

# Step 1: Identify dimensions -- `shape` is a (row_count, column_count) tuple
dimensions = data.shape
print("Dimensions of the dataset:", dimensions)

# Step 2: Determine data types (the dtype pandas assigned to each column)
data_types = data.dtypes
# sep="\n" puts the table on its own line. Embedding "\n" in the label instead
# makes print() insert its default space separator before the first row,
# which shifts that row one column to the right.
print("Data types of each column:", data_types, sep="\n")

# Suggestion for fixing data types if needed
# Example: data['Customer Lifetime Value'] = data['Customer Lifetime Value'].astype(float)

# Step 3: Unique values and identifying categorical columns
# DataFrame.nunique() counts the distinct non-null values of every column in
# one call; to_dict() turns the result into a {column_name: count} mapping.
unique_values = data.nunique().to_dict()
print("Number of unique values per column:", unique_values, sep="\n")

# Columns stored with the generic `object` dtype are treated as categorical.
categorical_columns = data.select_dtypes(include=['object']).columns
print("Categorical columns:", categorical_columns)

# Describe unique values of each categorical column
for col in categorical_columns:
    # sep="\n" keeps the array on its own line without a stray leading space.
    print(f"Unique values in {col}:", data[col].unique(), sep="\n")

# Range of values for numerical columns
# 'number' selects every numeric dtype. Listing only 'float64' and 'int64'
# would silently skip columns stored as e.g. int32, float32 or nullable Int64.
numerical_columns = data.select_dtypes(include='number').columns
print("Range of values for numerical columns:")
for col in numerical_columns:
    print(f"{col}: Min {data[col].min()}, Max {data[col].max()}")

# Step 4: Summary statistics for numerical columns
# describe() on numeric columns reports count, mean, std, min, quartiles, max.
summary_statistics = data[numerical_columns].describe()
# sep="\n" starts the table at column 0; with "\n" inside the label, print()
# would add a space before the header row and push it out of alignment.
print("Summary statistics for numerical columns:", summary_statistics, sep="\n")

# Step 5: Summary statistics for categorical columns
# describe() on object columns reports count, unique, top and freq instead.
categorical_summary = data[categorical_columns].describe()
print("Summary statistics for categorical columns:", categorical_summary, sep="\n")

# Challenge 2: Analyzing the data

# Exercise 1: Top 5 less common customer locations
# Counting in ascending order puts the rarest locations first, so the first
# five entries are the five least common ones.
less_common_locations = data['ST'].value_counts(ascending=True)
top_5_less_common_locations = less_common_locations.head(5)
print("Top 5 less common customer locations:", top_5_less_common_locations, sep="\n")

# Exercise 2: Total number of policies sold for each type of policy
# value_counts() tallies the rows per policy type (rows with a missing policy
# type are not counted) and sorts the result from most to least frequent.
policies_sold = data['Policy Type'].value_counts()
print("Total number of policies sold per policy type:", policies_sold, sep="\n")

# Kept as a (policy_type, count) tuple. It is formatted explicitly below
# because printing the tuple itself shows the repr of the NumPy integer
# (e.g. "np.int64(...)" on NumPy 2) instead of a plain number.
highest_policy_type = policies_sold.idxmax(), policies_sold.max()
print(
    "Policy type with the highest number of policies sold: "
    f"{highest_policy_type[0]} ({highest_policy_type[1]} policies)"
)

# Exercise 3: Comparison of average income between policy types

# Personal Auto: keep only the matching rows, then average their income.
personal_auto = data[data['Policy Type'].eq('Personal Auto')]
average_income_personal_auto = personal_auto['Income'].mean()

# Corporate Auto: same computation on the other subset.
corporate_auto = data[data['Policy Type'].eq('Corporate Auto')]
average_income_corporate_auto = corporate_auto['Income'].mean()

# Report both averages, one per line.
print(
    f"Average income for Personal Auto: {average_income_personal_auto}",
    f"Average income for Corporate Auto: {average_income_corporate_auto}",
    sep="\n",
)