In [None]:
import pandas as pd
import numpy as np

# Load the raw VeloCityX export into `df`.
try:
    df = pd.read_csv('VeloCityX.csv')
except FileNotFoundError as err:
    # Raise instead of calling exit(): exit() is meant for the interactive
    # interpreter and, in a notebook, ends the kernel session rather than
    # surfacing an ordinary cell error. Raising stops execution of this cell
    # with the same message and keeps the original traceback chained.
    raise FileNotFoundError("Error: The file 'VeloCityX.csv' was not found.") from err

# Preview the raw frame before any cleaning is applied.
raw_preview = df.head()
print("Original Data:\n", raw_preview)

# Report how many cells are missing in each column.
missing_per_column = df.isna().sum()
print("\nMissing Values per Column:", missing_per_column, sep="\n")

# Discard rows containing any NaN, then collapse rows that are exact
# duplicates across every column.
df_clean = df.dropna().drop_duplicates()

# Show the dtypes pandas inferred so mis-typed columns stand out.
print("\nData Types Before Conversion:", df_clean.dtypes, sep="\n")

# Columns that the later steps treat as numeric measurements.
numeric_columns = ['Fan Challenges Completed', 'Predictive Accuracy (%)', 
                   'Virtual Merchandise Purchases', 'Sponsorship Interactions (Ad Clicks)', 
                   'Time on Live 360 (mins)', 'Real-Time Chat Activity (Messages Sent)']

# Work on an independent copy so the column assignments below only ever
# modify this frame.
df_clean = df_clean.copy()

# Parse every listed column as a number; values that cannot be parsed
# become NaN because of errors='coerce'.
for col in numeric_columns:
    df_clean[col] = pd.to_numeric(df_clean[col], errors='coerce')

# The earlier dropna() ran before this coercion, so any NaN introduced here
# would otherwise survive into the saved CSV (the outlier filter keeps NaN
# rows because comparisons against NaN are False). Remove those rows now.
df_clean = df_clean.dropna(subset=numeric_columns)

# Values that differed only textually (e.g. "5" vs "5.0") are equal once
# parsed, so de-duplicate again to keep the frame free of repeated rows.
df_clean = df_clean.drop_duplicates()

# Re-check data types after conversion
print("\nData Types After Conversion:")
print(df_clean.dtypes)

# Summary of the numeric columns before trimming, for comparison with the
# post-trim summary printed below.
print("\nSummary Statistics (Before Removing Outliers):")
print(df_clean.describe())

# IQR rule: a value is an outlier when it lies more than 1.5 * IQR below the
# first quartile or above the third quartile of its column.
#
# The fences for every column are computed from the same, unfiltered frame
# and applied in one step. Filtering inside a per-column loop would make each
# later column's quartiles depend on rows already removed for earlier
# columns, so the result would change with the order of `numeric_columns`.
numeric_values = df_clean[numeric_columns]
Q1 = numeric_values.quantile(0.25)
Q3 = numeric_values.quantile(0.75)
IQR = Q3 - Q1
is_outlier = numeric_values.lt(Q1 - 1.5 * IQR) | numeric_values.gt(Q3 + 1.5 * IQR)

# Keep a row only when none of its numeric columns is flagged. Comparisons
# against NaN evaluate to False, so a missing value is never flagged here.
df_clean = df_clean[~is_outlier.any(axis=1)]

# Summary statistics after outlier removal
print("\nSummary Statistics (After Removing Outliers):")
print(df_clean.describe())

# Sanity check: count rows that still repeat an earlier row exactly.
remaining_duplicates = df_clean.duplicated().sum()
print("\nDuplicate Rows Left (Should be 0):", remaining_duplicates, sep="\n")

# Write the cleaned frame to disk without the index column; the later cells
# read this file back.
df_clean.to_csv('cleaned_data.csv', index=False)

print("\nData Cleaning Complete! The cleaned data has been saved as 'cleaned_data.csv'.")

In [None]:
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt
import seaborn as sns


# Read back the cleaned dataset saved as 'cleaned_data.csv'.
data = pd.read_csv('cleaned_data.csv')

# Step 1: mean of every column other than the 'User ID' identifier.
metrics_only = data.drop(columns=['User ID'])
column_means = metrics_only.mean()

# Step 2: the ten rows with the largest purchase counts, reduced to the
# user identifier and the purchase count. The original row index is kept so
# the same users can be looked up in `data` afterwards.
purchases_col = 'Virtual Merchandise Purchases'
top_10_users = data.nlargest(10, purchases_col)[['User ID', purchases_col]]
print("Top 10 Users Based on Virtual Merchandise Purchases:")
print(top_10_users)

# Step 3: engagement metrics on which the top buyers are measured against
# the dataset-wide mean.
columns_to_compare = ['Fan Challenges Completed', 'Predictive Accuracy (%)', 'Sponsorship Interactions (Ad Clicks)',
                      'Time on Live 360 (mins)', 'Real-Time Chat Activity (Messages Sent)']

# One boolean per (top-10 user, metric): True where the user's value is
# strictly greater than that metric's mean over all users.
top_10_metrics = data.loc[top_10_users.index, columns_to_compare]
above_average = top_10_metrics.gt(column_means[columns_to_compare])

# Turn the flags into readable labels next to each user's ID. A value equal
# to the mean is labelled 'Below Average' because the comparison is strict.
flag_labels = {True: 'Above Average', False: 'Below Average'}
comparison_result = top_10_users[['User ID']].copy()
for col in columns_to_compare:
    comparison_result[col] = above_average[col].map(flag_labels)

# Step 4: show the per-user, per-metric labels.
print("\nTop 10 Users and Columns Where They Exceeded the Average:")
print(comparison_result)

# Step 5: for each metric, how many of the top-10 users are above the mean.
exceed_counts = above_average.sum()

# Step 6: keep the metrics where strictly more than half of the top-10
# users are above the mean.
majority_threshold = len(top_10_users) / 2
high_impact_columns = exceed_counts[exceed_counts > majority_threshold].index

# Step 7: report those metrics.
print("\nColumns where More than 50% of Top 10 Users were Above Average:")
print(high_impact_columns)

# Step 8: ordinary least squares regression of purchases on the engagement
# metrics, to see how each metric relates to Virtual Merchandise Purchases.

# Response variable: the purchase count.
y = data['Virtual Merchandise Purchases']

# Predictors: the compared engagement metrics plus a constant column so the
# model can estimate an intercept.
X = sm.add_constant(data[columns_to_compare])

# Fit the model and print the full statsmodels summary table.
model = sm.OLS(y, X).fit()
print("\nOLS Regression Results:")
print(model.summary())

# Written takeaway from comparing the average-based view with the regression.
for takeaway in (
    "I was hoping there would be a great coorelation between my average analysis and using the regression analysis to see which columns are most impactful. However, the results are not as expected.",
    "this means there is need for more  analysis to understand the relationship between the columns and the dependent variable.  I will need to perform more. ",
):
    print(takeaway)


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

# Read the cleaned dataset from disk.
data = pd.read_csv('cleaned_data.csv')

# Engagement metrics to relate to 'Virtual Merchandise Purchases'.
columns_to_check = ['Fan Challenges Completed', 'Predictive Accuracy (%)', 
                    'Sponsorship Interactions (Ad Clicks)', 'Time on Live 360 (mins)', 
                    'Real-Time Chat Activity (Messages Sent)']

# For every metric: print its correlation with purchases, then draw a scatter
# plot of the pair and save it as a PNG.
purchases = data['Virtual Merchandise Purchases']
for column in columns_to_check:
    correlation = data[column].corr(purchases)
    print(f'Correlation between {column} and Virtual Merchandise Purchases: {correlation}')

    # Draw onto an explicit figure/axes pair instead of the implicit current figure.
    fig, ax = plt.subplots(figsize=(8, 6))
    sns.scatterplot(x=column, y='Virtual Merchandise Purchases', data=data, ax=ax)
    ax.set_title(f'{column} vs Virtual Merchandise Purchases')

    # File name is the column name with spaces replaced by underscores, lower-cased.
    column_slug = column.replace(" ", "_").lower()
    file_name = f'{column_slug}_vs_purchases.png'
    fig.savefig(file_name)
    print(f"Plot saved as '{file_name}'")


print("Using Scatterplot I was able to conclude that Fan Challenges Completed and Sponsorship Interaction has the most direct correlation with Virtual Merchandise Purchases. This suggests that users who are more engaged those activities are more likely to make purchases. ")


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

# Read the cleaned dataset from disk.
data = pd.read_csv('cleaned_data.csv')

# Features fed to the clustering. The purchase count is deliberately last:
# the plotting loop slices it off with [:-1] and uses it as the y-axis.
columns_to_use = ['Fan Challenges Completed', 'Predictive Accuracy (%)', 
                  'Sponsorship Interactions (Ad Clicks)', 'Time on Live 360 (mins)', 
                  'Real-Time Chat Activity (Messages Sent)', 'Virtual Merchandise Purchases']

# Step 1: standardise the selected features with StandardScaler before clustering.
cluster_features = data[columns_to_use]
scaler = StandardScaler().fit(cluster_features)
data_scaled = scaler.transform(cluster_features)

# Step 2: K-Means with 3 clusters (other counts can be tried); the fixed
# random_state keeps the assignment repeatable between runs.
kmeans = KMeans(n_clusters=3, random_state=42).fit(data_scaled)
data['Cluster'] = kmeans.labels_

# Step 3: for every feature except the last entry of `columns_to_use`
# ('Virtual Merchandise Purchases'), plot it against purchases with points
# coloured by cluster, and save each figure as a PNG.
for column in columns_to_use[:-1]:  # Exclude 'Virtual Merchandise Purchases' for plotting
    plt.figure(figsize=(10, 6))
    sns.scatterplot(x=column, y='Virtual Merchandise Purchases', hue='Cluster', data=data, palette='Set2')
    plt.title(f'K-Means Clustering of Users Based on {column}')
    plt.xlabel(column)
    plt.ylabel('Virtual Merchandise Purchases')
    
    # Save the plot as a PNG file
    file_name = f'{column.replace(" ", "_").lower()}_vs_purchases K-Means.png'
    plt.savefig(file_name)
    plt.close()  # Close the figure to free up memory
    print(f"Plot saved as '{file_name}'")

# Print the conclusion once, after all plots are written. It previously sat
# inside the loop body and was therefore repeated for every plotted column.
print('The combination of the clustering  results and the plots will help you understand the behavior of your users. Which lets us understand that the user who interacts with fan challenges completed and Sponsorship Interaction would be more likely to purchase merchandise. Which approves our hypothesis also.')


