In [30]:
import pandas as pd
import os
from google.colab import drive

# Mount Google Drive so the CSV stored under MyDrive is reachable from this runtime.
drive.mount('/content/drive')

file_path = '/content/drive/MyDrive/customer_signups.csv'

# `df` stays None when loading fails, so the cleaning/summary section below is skipped.
df = None
if not os.path.exists(file_path):
    print(f"File not found at: {file_path}")
else:
    try:
        df = pd.read_csv(file_path)
        print("Customer data loaded.")
    except Exception as e:
        print(f"Error loading CSV: {e}")

if df is not None:
    print("\n--- Data Cleaning Process ---")

    print("Initial Info:")
    # DataFrame.info() writes its report itself and returns None;
    # wrapping it in print() would add a stray "None" line to the output.
    df.info()
    print("\nInitial Missing Values:")
    print(df.isnull().sum())

    df_before_cleaning_count = df.shape[0]

    # Unparseable dates become NaT and those rows are dropped.
    # .copy() makes the filtered frame independent so the column assignments
    # below cannot trigger SettingWithCopyWarning.
    df['signup_date'] = pd.to_datetime(df['signup_date'], errors='coerce')
    df = df.dropna(subset=['signup_date']).copy()

    # Normalise the key before de-duplicating so ids that differ only by
    # surrounding whitespace or letter case collapse to one customer.
    # Missing ids are kept missing instead of becoming the string 'NAN'.
    raw_customer_ids = df['customer_id']
    df['customer_id'] = (
        raw_customer_ids.astype(str).str.strip().str.upper().where(raw_customer_ids.notna())
    )

    df_before_duplicates = df.shape[0]
    df = df.drop_duplicates(subset='customer_id').copy()
    duplicates_removed = df_before_duplicates - df.shape[0]

    # strip + title-case already maps 'PRO' / 'pro ' / 'Pro' to 'Pro', so no
    # replace table is needed; missing plans get an explicit label.
    df['plan_selected'] = df['plan_selected'].str.strip().str.title().fillna('Unknown')

    # After title-casing only the one-letter abbreviations need expanding.
    df['gender'] = (
        df['gender']
        .str.strip()
        .str.title()
        .replace({'M': 'Male', 'F': 'Female'})
        .fillna('Not Specified')
    )

    df['source'] = df['source'].fillna('Unknown')
    df['region'] = df['region'].fillna('Unknown')
    df['email'] = df['email'].fillna('unknown@example.com')
    df['name'] = df['name'].fillna('No Name Provided')

    # Non-numeric ages become NaN and are then imputed with the median.
    df['age'] = pd.to_numeric(df['age'], errors='coerce')
    median_age = df['age'].median()
    df['age'] = df['age'].fillna(median_age)

    # Normalise whitespace and case so variants such as 'YES' or ' yes' all
    # become 'Yes'; a missing answer is treated as 'No'.
    df['marketing_opt_in'] = df['marketing_opt_in'].str.strip().str.title().fillna('No')

    print("\nData Cleaning Complete.")

    print("\n--- Data Quality Summary ---")
    print(f"Rows Before Cleaning: {df_before_cleaning_count}")
    print(f"Rows After Cleaning: {df.shape[0]}")
    print("Missing Values After Cleaning:")
    print(df.isnull().sum())
    print("\nPercentage Missing After Cleaning:")
    # The mean of the boolean null mask is the missing fraction per column.
    print(df.isnull().mean() * 100)
    print(f"\nDuplicates Removed: {duplicates_removed}")

    print("\n--- Summary Outputs ---")
    # 'W' resampling buckets sign-ups into calendar weeks; size() counts rows per bucket.
    df_weekly = df.set_index('signup_date').resample('W').size().reset_index(name='signups')
    print("\nSign-ups per Week (first 5 rows):")
    print(df_weekly.head())

    print("\nSign-ups by Source:")
    print(df['source'].value_counts())

    print("\nSign-ups by Region:")
    print(df['region'].value_counts())

    print("\nSign-ups by Plan Selected:")
    print(df['plan_selected'].value_counts())

    print("\nMarketing Opt-in Counts by Gender:")
    print(df.loc[df['marketing_opt_in'] == 'Yes', 'gender'].value_counts())

    print("\nAge Summary:")
    print(pd.Series({
        'Min Age': df['age'].min(),
        'Max Age': df['age'].max(),
        'Mean Age': df['age'].mean(),
        'Median Age': df['age'].median(),
        'Null Age Count': df['age'].isnull().sum()
    }))

    print("\n--- Cleaned DataFrame Head ---")
    print(df.head())
    print("\n--- Cleaned DataFrame Info ---")
    df.info()

else:
    print("\nCannot proceed: DataFrame not loaded.")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Customer data loaded.

--- Data Cleaning Process ---
Initial Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300 entries, 0 to 299
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   customer_id       300 non-null    object
 1   name              294 non-null    object
 2   email             266 non-null    object
 3   signup_date       298 non-null    object
 4   source            291 non-null    object
 5   region            270 non-null    object
 6   plan_selected     292 non-null    object
 7   marketing_opt_in  290 non-null    object
 8   age               288 non-null    object
 9   gender            292 non-null    object
dtypes: object(10)
memory usage: 23.6+ KB
None

Initial Missing Values:
customer_id          0
name                 6
email               34
signup

  df['signup_date'] = pd.to_datetime(df['signup_date'], errors='coerce')


In [35]:
import pandas as pd
import os

def normalize_customer_id(ids):
    """Return customer ids stripped of surrounding whitespace and upper-cased.

    Missing ids stay missing instead of turning into the literal string 'NAN',
    so they can be filtered out before a merge.
    """
    return ids.astype(str).str.strip().str.upper().where(ids.notna())


def users_with_tickets_per_plan(merged):
    """Count distinct customers per plan that raised at least one ticket.

    `merged` must have the columns 'plan_selected', 'customer_id' and
    'ticket_id'; rows whose 'ticket_id' is missing (customers without any
    ticket after a left merge) are ignored.  Returns a frame with the columns
    'plan_selected' and 'users_with_tickets'.
    """
    with_ticket = merged[merged['ticket_id'].notna()]
    return (
        with_ticket.groupby('plan_selected')['customer_id']
        .nunique()
        .reset_index(name='users_with_tickets')
    )


print("\n--- Optional Task: Support Tickets Analysis ---")

file_path_support_tickets = '/content/drive/MyDrive/support_tickets.csv'

# `df_support` stays None when the file is missing or cannot be read.
df_support = None
if not os.path.exists(file_path_support_tickets):
    print(f"Support tickets file not found at: {file_path_support_tickets}")
    print("Please ensure 'support_tickets.csv' is in your Google Drive's root 'MyDrive' folder.")
else:
    try:
        df_support = pd.read_csv(file_path_support_tickets)
        print("Support tickets data loaded successfully.")
        print("\nInitial Support Tickets Info:")
        # info() prints its own report and returns None, so it is not wrapped in print().
        df_support.info()
        print("\nSupport Tickets Head:")
        print(df_support.head())

        df_support['customer_id'] = normalize_customer_id(df_support['customer_id'])
        print("\nCleaned support_tickets customer_id for merging.")

        # The sign-ups frame `df` comes from the sign-ups cell; fail with a
        # clear message if that cell has not produced it.
        signups = globals().get('df')
        if signups is None:
            raise RuntimeError(
                "customer sign-ups DataFrame `df` is not loaded; run the sign-ups cell first"
            )
        # Apply the same key normalisation to both sides (on a copy, leaving
        # `df` untouched) so ids differing only in case/whitespace still match.
        signups = signups.assign(customer_id=normalize_customer_id(signups['customer_id']))

        # Tickets without a customer id cannot be attributed to anyone, and a
        # missing key would otherwise match a sign-up whose id is also missing.
        tickets_with_customer = df_support.dropna(subset=['customer_id'])

        merged_df = pd.merge(signups, tickets_with_customer, on='customer_id', how='left')
        print("\nMerged customer sign-ups with support tickets data.")
        print(merged_df.head())
        merged_df.info()

        # count() ignores the missing ticket_id of customers who never raised a ticket.
        support_tickets_per_plan = merged_df.groupby('plan_selected')['ticket_id'].count().reset_index(name='total_tickets_raised')

        users_per_plan = signups['plan_selected'].value_counts().reset_index()
        users_per_plan.columns = ['plan_selected', 'total_users_on_plan']

        plan_support_summary = (
            support_tickets_per_plan
            .merge(users_per_plan, on='plan_selected', how='left')
            .merge(users_with_tickets_per_plan(merged_df), on='plan_selected', how='left')
        )
        # Plans where nobody raised a ticket are absent from the helper's result.
        plan_support_summary['users_with_tickets'] = (
            plan_support_summary['users_with_tickets'].fillna(0).astype(int)
        )

        # Average ticket volume per user; this can exceed 1 when customers
        # raise several tickets, so it is reported separately from the likelihood.
        plan_support_summary['tickets_per_user'] = (
            plan_support_summary['total_tickets_raised'] / plan_support_summary['total_users_on_plan']
        )

        # Likelihood = share of a plan's users who contacted support at least
        # once (distinct customers, not ticket count), as a percentage.
        plan_support_summary['likelihood_to_contact_support_pct'] = (
            plan_support_summary['users_with_tickets'] / plan_support_summary['total_users_on_plan']
        ) * 100

        print("\nLikelihood to Contact Support by Plan (sorted by likelihood):")
        print(plan_support_summary.sort_values(by='likelihood_to_contact_support_pct', ascending=False).round(2))

    except Exception as e:
        print(f"An error occurred during support tickets analysis: {e}")


--- Optional Task: Support Tickets Analysis ---
Support tickets data loaded successfully.

Initial Support Tickets Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 123 entries, 0 to 122
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   ticket_id    123 non-null    object
 1   customer_id  123 non-null    object
 2   ticket_date  123 non-null    object
 3   issue_type   123 non-null    object
 4   resolved     123 non-null    object
dtypes: object(5)
memory usage: 4.9+ KB
None

Support Tickets Head:
   ticket_id customer_id ticket_date       issue_type resolved
0  TKT0000-1   CUST00203  2024-08-17          Billing      Yes
1  TKT0000-2   CUST00203  2024-07-22  Technical Error      Yes
2  TKT0000-3   CUST00203  2024-07-22            Other      Yes
3  TKT0001-1   CUST00266  2024-09-26    Account Setup      Yes
4  TKT0001-2   CUST00266  2024-10-09  Technical Error       No

Cleaned support_tickets customer_id for 