In [None]:
# First step: read the data from a CSV file and take a look at it. Note that I worked in Google Colab and then copied my code and results here.
import pandas as pd
import numpy as np
from google.colab import files
import seaborn as sns
import matplotlib.pyplot as plt

# Upload the CSV file from the local machine into the Colab runtime.
# Outside Colab, these steps can be replaced by a plain file path.
uploaded = files.upload()

# The keys of `uploaded` are used as filenames; take the first one.
# Fail with a clear message instead of an IndexError when nothing was uploaded.
if not uploaded:
    raise ValueError("No file was uploaded; cannot load the dataset.")
filename = next(iter(uploaded))

# Read the CSV file into a pandas DataFrame
df = pd.read_csv(filename)

# Display basic info.
# DataFrame.info() prints its report itself and returns None, so it must not
# be wrapped in print() (that would add a stray "None" line to the output).
print("Data Overview:")
df.info()

# Convert date columns to datetime; unparseable or missing values become NaT
date_columns = ['Registration Date', 'Date of Premium Subscription']
for col in date_columns:
    df[col] = pd.to_datetime(df[col], errors='coerce')

# Check for missing values. The missing data belongs to Free accounts, which
# have no premium subscription date, so those rows must not be dropped:
# dropping them would remove every Free account from the analysis.
print("\nMissing Values:\n", df.isnull().sum())

df = df.copy()  # Work on an independent copy of the frame

# Convert categorical variables to category dtype.
# NOTE(review): depending on the pandas version, a full-column .loc assignment
# may write the values in place and leave the column dtype unchanged; check
# df.dtypes. Downstream cells should be re-checked before switching this to a
# direct `df[col] = ...` assignment.
categorical_cols = ['Current Subscription Status', 'Primary Device', 'Age Group', 'Country', 'Favorite Genre', 'Engagement Trend']
for col in categorical_cols:
    df.loc[:, col] = df[col].astype("category")

# Display the first ten rows
df.head(10)


In [None]:
# Overview of the whole dataset: number of users per subscription status.
# Colour used for each subscription status
custom_palette = {"Premium": "red", "Free": "blue"}

# Explicit bar order: "Free" first, then "Premium"
category_order = ["Free", "Premium"]

plt.figure(figsize=(8, 5))
ax = sns.countplot(
    data=df,
    x="Current Subscription Status",
    hue="Current Subscription Status",
    order=category_order,
    palette=custom_palette,
    legend=False,
)

# Label every bar with its share of all rows in df
total = len(df)
for bar in ax.patches:
    bar_height = bar.get_height()
    share_of_total = (bar_height / total) * 100
    # Horizontal centre of the bar; the label sits just above its top edge
    bar_center_x = bar.get_x() + bar.get_width() / 2.
    ax.annotate(
        f'{share_of_total:.1f}%',
        (bar_center_x, bar_height),
        ha='center', va='bottom',
        fontsize=12, fontweight='bold', color='black',
    )

plt.title("Subscription Status Distribution")
plt.xlabel("Subscription Type")
plt.ylabel("Count")
plt.show()

In [None]:
# As the plot shows, the majority of users are Premium users: they make up about 74% of all users.
# The next step is exploratory data analysis to understand the characteristics of Free and Premium users.

In [None]:

# Numeric metrics to compare between subscription statuses
metrics = ['Number of Videos Watched', 'Total Watch Time', 'Number of Customer Support Interactions', 'Net Promoter Score', 'Social Shares']

plt.figure(figsize=(12, 6))

# One boxplot panel per metric on a 2x3 grid (subplot positions are 1-based)
for panel_position, col in enumerate(metrics, start=1):
    plt.subplot(2, 3, panel_position)

    # hue mirrors x so the palette is applied per status; the legend is suppressed
    sns.boxplot(
        data=df,
        x="Current Subscription Status",
        y=col,
        hue="Current Subscription Status",
        palette="coolwarm",
        legend=False,
    )

    plt.title(f"{col} by Subscription Status")

# Spacing between panels.
# NOTE(review): tight_layout() recomputes subplot spacing, so the manual hspace
# set on the line before it may not survive; confirm which one is intended.
plt.subplots_adjust(hspace=0.9)
plt.tight_layout()
plt.show()


In [None]:

# Correlation matrix over the numeric columns plus numeric encodings of
# 'Current Subscription Status' and 'Engagement Trend'.
df_encoded = df.copy()

# Encode 'Current Subscription Status' (Premium=1, anything else=0).
# A vectorised compare-and-select is used instead of Series.apply so the
# encoded column is a plain numeric array whether the source column is
# object- or category-typed; select_dtypes below keeps only numeric columns,
# so a non-numeric result would silently vanish from the heatmap.
subscription_status = df_encoded['Current Subscription Status']
df_encoded['Numeric Subscription Status'] = np.where(subscription_status == "Premium", 1, 0)

# Encode 'Engagement Trend': Decreasing=0, Stable=0.5, anything else=1
engagement_trend = df_encoded['Engagement Trend']
df_encoded['Numeric Engagement Trend'] = np.where(
    engagement_trend == "Decreasing",
    0.0,
    np.where(engagement_trend == "Stable", 0.5, 1.0),
)

# Drop 'User ID' and 'Days to Convert' if present; they carry no meaning for
# this analysis. errors='ignore' skips whichever of them does not exist.
columns_to_drop = ['User ID', 'Days to Convert']
df_encoded = df_encoded.drop(columns=columns_to_drop, errors='ignore')

# Keep only numeric columns for the correlation
df_numeric = df_encoded.select_dtypes(include='number')

# Pairwise correlation matrix
correlation_matrix = df_numeric.corr()

# Plot heatmap with the coefficients printed in each cell
plt.figure(figsize=(12, 8))
sns.heatmap(correlation_matrix, annot=True, cmap="coolwarm", fmt=".2f")
plt.title("Correlation Matrix with Encoded Variables")
plt.show()

# Looking at the correlation matrix, there is a high positive correlation (0.94) between Total Watch Time and Number of Videos Watched. This makes sense, as users who watch more videos tend to have a higher watch time. There is also a negative correlation between Net Promoter Score and Number of Customer Support Interactions, which is reasonable too.
# Numeric Subscription Status, which is the variable that matters most to us, has a moderate correlation with Net Promoter Score and only weak correlations with Numeric Engagement Trend, Social Shares, Number of Customer Support Interactions, Total Watch Time and Number of Videos Watched.


In [None]:
# Analyze the share of Premium users by Country, Age Group, and Favorite Genre
categorical_columns = ['Country', 'Age Group', 'Favorite Genre']

for col in categorical_columns:
    # Row-normalised crosstab: each row holds the fraction of every
    # subscription status within one segment (the row sums to 1).
    # The percentage is taken from this table as a separate Series instead of
    # being inserted as an extra column into the count table, so the counts
    # are not mutated and nothing depends on the dtype of the column labels.
    status_share = pd.crosstab(df[col], df['Current Subscription Status'], normalize='index')

    # Premium users as a percentage of all users (Premium + Free) per segment,
    # sorted from smallest to largest to make the bars easier to compare
    premium_pct = (status_share['Premium'] * 100).rename('Premium %').sort_values(ascending=True)

    # Plot only the Premium percentage
    premium_pct.plot(kind='bar', color='purple', alpha=0.8, figsize=(8, 5))

    plt.title(f"Percentage of Premium Users by {col}")
    plt.ylabel("Percentage of Premium Users")
    plt.xticks(rotation=45)
    plt.show()

# There is only a weak relationship between age, country, or genre and having a Premium account (the percentages are very close), but this part of the analysis may still be useful for an A/B test in the future.


In [None]:

# Another view of the data: look only at Premium users
df_premium = df.loc[df['Current Subscription Status'] == 'Premium']

# Columns whose distribution among Premium users is plotted
categories = ['Favorite Genre', 'Country', 'Age Group' , 'Primary Device', 'Engagement Trend']


def plot_pie_chart(column):
    """Draw a pie chart of how Premium users split across the values of `column`.

    Reads the module-level `df_premium` frame. Slices are ordered from the
    largest share to the smallest and coloured from the "coolwarm" palette,
    one colour per slice.

    Parameters
    ----------
    column : str
        Name of the column in `df_premium` to summarise.
    """
    # Percentage of each value, largest first (the order drives the colour gradient)
    shares = (
        df_premium[column]
        .value_counts(normalize=True)
        .mul(100)
        .sort_values(ascending=False)
    )

    # One palette colour per slice
    slice_colors = sns.color_palette("coolwarm", len(shares))

    plt.figure(figsize=(7, 7))
    plt.pie(
        shares,
        labels=shares.index,
        autopct='%1.1f%%',
        colors=slice_colors,
        startangle=140,
    )
    plt.title(f"Distribution of {column} in Premium Users", fontsize=14, fontweight='bold')
    plt.show()


# One pie chart per category column
for category_column in categories:
    plot_pie_chart(category_column)

# As before, the numbers are very close, but we can still use them in an A/B test: the 65+ age group (16.8%) and the 18-24, 25-34 and 35-44 groups (16.7% each) make up the largest shares of Premium users, and Documentary (17.1%), Action (16.8%) and Sci-Fi (16.7%) are the largest genres.

In [None]:
# Days between registration and the premium subscription date
# (written back to df as the 'Days to Convert' column)
df['Days to Convert'] = df['Date of Premium Subscription'].sub(df['Registration Date']).dt.days

# Keep only Premium users
df_premium = df[df['Current Subscription Status'] == 'Premium']
premium_days = df_premium['Days to Convert']

# Observed range; used both to clip the KDE and as the x-axis limits
min_days, max_days = premium_days.min(), premium_days.max()

plt.figure(figsize=(8, 5))

# Density-scaled histogram (no built-in KDE) ...
sns.histplot(premium_days, bins=30, color="blue", stat="density", alpha=0.6)

# ... with the KDE drawn separately, clipped to the observed range
sns.kdeplot(premium_days, color="red", linewidth=2, clip=(min_days, max_days))

plt.xlim(min_days, max_days)
plt.title("Distribution of Days to Convert to Premium")
plt.xlabel("Days from Registration to Subscription")
plt.show()

# As the plot shows, the density is highest in the first days after registration; it then levels off and finally decreases.

In [None]:
# A/B Test Design to Improve Conversion Rates. From the data we learned that engagement is highest in the first days after registration, that the age groups with the largest share of Premium users are 65+ and 25-44, and that there is a moderate correlation between Net Promoter Score and subscription status.
# To evaluate the effectiveness of targeted interventions aimed at increasing conversion rates from free to premium subscriptions, focusing on engagement during the early registration days, age segments (25-44 & 65+), and Net Promoter Score (NPS).

#  Hypothesis
# H0 (Null Hypothesis): The intervention does not significantly impact conversion rates.
# H1 (Alternative Hypothesis): The intervention increases conversion rates among free users.
# Early Engagement Incentives:
# Personalized email/push notifications within the first 7 days of registration.
# Special discounts or free trials for users who show high engagement early on.
# Age-Specific Messaging:
# For 25-44: Highlight exclusive features like mobile-friendly content, binge-watching recommendations.
# For 65+: Emphasize ease of use, customer support, and curated content.
# Leverage Net Promoter Score (NPS):
# Users with high NPS (7-10): Offer referral rewards for inviting others.
# Users with low NPS (0-6): Collect feedback and provide limited-time premium perks to improve their experience.


In [None]:
import pandas as pd
import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt
import seaborn as sns

# The A/B test targets Free users only; copy so new columns can be added safely
is_free_user = df['Current Subscription Status'] == 'Free'
df_free = df[is_free_user].copy()

# Fixed seed, then a 50/50 random split into Control (A) and Test (B)
np.random.seed(42)
group_labels = ['Control', 'Test']
df_free['AB_Group'] = np.random.choice(group_labels, size=len(df_free), p=[0.5, 0.5])

# Baseline conversion probability (5%), applied to every user
base_conversion_rate = 0.05  # 5% baseline conversion for Control group

# Age-group labels that receive the age-specific messaging uplift.
# The original check used '25-44', but the notebook's own age-group commentary
# reports the segments as 25-34 and 35-44, so '25-44' alone never matched them.
# '25-44' is kept so data that does use that label behaves as before.
# NOTE(review): confirm the exact labels against df['Age Group'].unique().
TARGET_AGE_GROUPS = ('25-44', '25-34', '35-44', '65+')


def simulate_conversion(row):
    """Simulate whether one free user converts to Premium.

    Control users convert with probability `base_conversion_rate`. Test users
    start from the same baseline and receive additive uplifts:

    * +0.05 when 'Days to Convert' is at most 7,
    * +0.03 when 'Age Group' is one of `TARGET_AGE_GROUPS`,
    * +0.02 when 'Net Promoter Score' is at least 7.

    Parameters
    ----------
    row : mapping (e.g. a DataFrame row)
        Must provide 'AB_Group', 'Days to Convert', 'Age Group' and
        'Net Promoter Score'.

    Returns
    -------
    bool
        True when the uniform draw from np.random.rand() falls below the
        user's conversion probability.
    """
    if row['AB_Group'] == 'Control':
        return np.random.rand() < base_conversion_rate  # 5% chance

    # Test group: baseline plus the uplift of every intervention that applies
    prob = base_conversion_rate

    # NOTE(review): a comparison with NaN is always False. 'Days to Convert' is
    # derived from the premium subscription date, which the notebook's
    # missing-value check reports as absent for Free accounts, so this uplift
    # presumably never fires for the users simulated here. A real
    # early-engagement signal is needed to model this intervention.
    if row['Days to Convert'] <= 7:
        prob += 0.05  # +5% for early engagement users
    if row['Age Group'] in TARGET_AGE_GROUPS:
        prob += 0.03  # +3% for key age segments
    if row['Net Promoter Score'] >= 7:
        prob += 0.02  # +2% if they are a promoter
    return np.random.rand() < prob

# Run the simulation once per free user (row-wise) and store the outcome
converted_flags = df_free.apply(simulate_conversion, axis=1)
df_free['Converted'] = converted_flags


In [None]:
# Conversions and group sizes per A/B group, aligned on the same group order
conversion_counts = df_free.groupby('AB_Group')['Converted'].sum()
total_counts = df_free['AB_Group'].value_counts().reindex(conversion_counts.index)
conversion_rates = conversion_counts / total_counts

# 2x2 contingency table: rows = A/B group, columns = converted / not converted
contingency_table = pd.DataFrame({
    'Converted': conversion_counts,
    'Non-Converted': total_counts - conversion_counts,
})

# Chi-square test of independence between group and conversion outcome
chi2, p_value, _, _ = stats.chi2_contingency(contingency_table)

# Print results. The heading announces rates, so the rate is shown next to the
# counts instead of printing the counts alone.
print("Conversion Rates:")
print(contingency_table.assign(**{'Conversion Rate': conversion_rates}))
print(f"\nChi-square Test Statistic: {chi2:.4f}, p-value: {p_value:.4f}")

# Visualize conversion rates. hue mirrors x (with the legend suppressed) so the
# palette is applied per group, consistent with the earlier plots in this
# notebook; passing palette without hue is deprecated in recent seaborn.
plt.figure(figsize=(6, 4))
sns.barplot(
    x=conversion_rates.index,
    y=conversion_rates.values,
    hue=conversion_rates.index,
    palette=['blue', 'red'],
    legend=False,
)
plt.title("A/B Test Results: Conversion Rate")
plt.ylabel("Conversion Rate")
plt.show()


In [None]:
# 2. Chi-Square Test & p-Value
# Chi-square statistic: 3.0043
# p-value: 0.0830
# The p-value (0.0830) is greater than 0.05, meaning the difference is not statistically significant at the 5% level.
# This suggests that the increase in conversion rate might be due to random chance rather than a real effect of the intervention.
# In simple terms: We cannot confidently say that the intervention caused the increase in conversions.
# For Next Steps i would add other parameters to my test :

In [None]:
# Second scenario: also use the earlier findings that 25.6% of Premium users
# use a Smart TV and 25.5% use a desktop, and that the most popular genres
# among Premium users are Documentary (17.1%), Action (16.8%) and
# Science Fiction (16.7%).

# Simulating a dataset (Replace with actual A/B test data)
np.random.seed(42)

# Number of simulated users. The original comment said 3800, but
# range(1, 3800) yields 3799 IDs and every column was sized 3799; the count is
# now defined once so the columns cannot drift apart.
N_USERS = 3799

# Sample dataset with A/B group, device type and favorite genre per user.
# The columns are drawn in this order so the random stream is unchanged.
df_ab_test = pd.DataFrame({
    'User_ID': range(1, N_USERS + 1),
    'AB_Group': np.random.choice(['Control', 'Test'], size=N_USERS, p=[0.5, 0.5]),
    'Device_Type': np.random.choice(['Smart TV', 'Desktop', 'Mobile', 'Tablet'], size=N_USERS, p=[0.26, 0.25, 0.30, 0.19]),
    'Favorite_Genre': np.random.choice(['Documentary', 'Action', 'Science Fiction', 'Drama', 'Comedy'], size=N_USERS, p=[0.17, 0.168, 0.162, 0.25, 0.25]),
})

# Segments the simulated intervention targets
TARGET_DEVICES = ['Smart TV', 'Desktop']
TARGET_GENRES = ['Documentary', 'Action', 'Science Fiction']


def _simulate_targeted_conversion(row):
    """Return 1 (converted) or 0 for one simulated user.

    Control users convert with probability 0.12. Test users convert with
    probability 0.18 when their device OR their favorite genre is targeted,
    and 0.14 otherwise.
    """
    if row['AB_Group'] == 'Control':
        return np.random.choice([1, 0], p=[0.12, 0.88])
    if row['Device_Type'] in TARGET_DEVICES or row['Favorite_Genre'] in TARGET_GENRES:
        return np.random.choice([1, 0], p=[0.18, 0.82])
    return np.random.choice([1, 0], p=[0.14, 0.86])


# Simulate conversion row by row (one random draw per user)
df_ab_test['Converted'] = df_ab_test.apply(_simulate_targeted_conversion, axis=1)

# Conversion rates per group
conversion_rates = df_ab_test.groupby('AB_Group')['Converted'].agg(['sum', 'count'])
conversion_rates['Conversion Rate'] = conversion_rates['sum'] / conversion_rates['count']

print("Conversion Rates:")
print(conversion_rates)

# Chi-square test on the group x converted contingency table
contingency_table = pd.crosstab(df_ab_test['AB_Group'], df_ab_test['Converted'])
chi2, p, _, _ = stats.chi2_contingency(contingency_table)

print(f"\nChi-square Test Statistic: {chi2:.4f}, p-value: {p:.4f}")

# Visualize the results
plt.figure(figsize=(8, 5))
conversion_counts = contingency_table[1]  # Number of converted users
total_counts = contingency_table.sum(axis=1)  # Total users in each group
conversion_rates_plot = conversion_counts / total_counts

# hue mirrors x (with the legend suppressed) so the palette is applied per
# group, consistent with the earlier plots in this notebook; passing palette
# without hue is deprecated in recent seaborn.
ax = sns.barplot(
    x=conversion_rates_plot.index,
    y=conversion_rates_plot.values,
    hue=conversion_rates_plot.index,
    palette=['blue', 'red'],
    legend=False,
)
plt.title("A/B Test Results: Conversion Rate")
plt.ylabel("Conversion Rate")
plt.xlabel("A/B Group")

# Annotate each bar with its conversion rate, slightly above the bar top
for i, rate in enumerate(conversion_rates_plot):
    ax.text(i, rate + 0.005, f"{rate:.1%}", ha='center', fontsize=12, fontweight='bold')

plt.show()


# as results we have:
# Control Group: 11.76% (222 / 1888)
# Test Group: 18.21% (348 / 1911)
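# The Test group has a higher conversion rate than the Control group (18.21% vs. 11.76%), meaning the simulated intervention (targeting Smart TV and Desktop users and the favorite genres) improved conversion. Note that this uplift is built into the simulation itself (18%/14% vs. 12% conversion probability), so the result illustrates the testing method and still has to be confirmed with real A/B test data.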
# Chi-square statistic: 30.4962 (measures how different the groups are)
# p-value: 0.0000 (very small, < 0.05)
# Since p-value is close to 0, we reject the null hypothesis.
# This means the intervention had a statistically significant impact on conversion rates.

# So this is an improvement compared to the previous A/B test: the Test group shows a strong improvement (18.21% vs. 11.76%), and the difference is statistically significant.

# Recommendation: Run the test for a longer period to check whether conversion rates remain high. We can also offer limited-time premium trials within the first 7 days, send personalized email sequences to new users highlighting the benefits, and introduce exclusive onboarding incentives (e.g., "First-week premium deal"). Priority level: Medium-High, since these measures can significantly impact early conversion rates.