Importing libraries

In [52]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans

Reading the file `MERGED_DATA.csv` into a dataframe

In [2]:
# Load the merged dataset from the CSV file in the working directory.
df = pd.read_csv(filepath_or_buffer='MERGED_DATA.csv')

In [None]:
# Lift the column display limit so wide frames render every column,
# then show the loaded frame.
pd.options.display.max_columns = None
df

Computing the correlation matrix for selected features, and visualizing it as a heatmap.

In [None]:
# Columns of `df` whose pairwise correlations are compared.
heatmap_features = [
    'POST_RELEASE_PURCHASE',
    'SUPPORTS_LTE',
    'CAMERA_FLASHLIGHT',
    'SUPPORTS_MULTISIM',
    'SUPPORTS_ESIM',
    'SUPPORT_5G',
    'RAM_SIZE',
    'SIM_COUNT',
    'CPU_CORES',
    'BATTERY_CAPACITY',
]
correlation_matrix = df[heatmap_features].corr()

# Annotated heatmap on a diverging palette centred at 0, so positive and
# negative coefficients get opposite colours.
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0, ax=ax)
ax.set_title('Correlation Heatmap')
plt.show()

Calculating the average `POST_RELEASE_PURCHASE` period for each `BRAND`.

In [4]:
# Mean POST_RELEASE_PURCHASE per brand, returned as a two-column frame
# (BRAND, POST_RELEASE_PURCHASE).
brand_post_release = (
    df.groupby('BRAND')['POST_RELEASE_PURCHASE']
    .mean()
    .reset_index()
)

Counting the number of devices for each `BRAND` and renaming the resulting column to `COUNTS`.

In [5]:
# Number of non-null DEVICE_ID entries per brand, exposed as the COUNTS column.
brand_device_id = (
    df.groupby('BRAND')['DEVICE_ID']
    .count()
    .rename('COUNTS')
    .reset_index()
)

Sorting the brands in descending order by `COUNTS` (keeping the top 15) and by `POST_RELEASE_PURCHASE`.

In [6]:
# The 15 brands with the highest device counts, largest first.
brands_by_count = brand_device_id.sort_values(by='COUNTS', ascending=False)
brand_device_top15 = brands_by_count.iloc[:15]

In [7]:
# All brands ordered by mean POST_RELEASE_PURCHASE, highest first.
# NOTE(review): despite the "top15" name, no rows are dropped here.
brand_post_days_top15 = brand_post_release.sort_values(
    by=['POST_RELEASE_PURCHASE'],
    ascending=[False],
)

Merging `brand_device_top15` and `brand_post_days_top15` on `BRAND` with an **inner join**.

In [8]:
# Attach the mean POST_RELEASE_PURCHASE to the top-15-by-count brands.
# The inner join keeps only brands present in both frames.
merged_df = brand_device_top15.merge(
    brand_post_days_top15,
    on='BRAND',
    how='inner',
)
# Casting to int truncates the fractional part of the mean.
merged_df = merged_df.astype({'POST_RELEASE_PURCHASE': int})

In [None]:
# Show the per-brand table: device COUNTS plus the integer-cast mean POST_RELEASE_PURCHASE.
merged_df

Generating a bar chart displaying the average `POST_RELEASE_PURCHASE` for each `BRAND`.

In [None]:
# Bar chart of the mean POST_RELEASE_PURCHASE, one bar per brand in brand_post_release.
fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(
    brand_post_release['BRAND'],
    brand_post_release['POST_RELEASE_PURCHASE'],
    color='#FF2400',
)

ax.set_xlabel('Brand')
ax.set_ylabel('Average Post-Release Purchase')
ax.set_title('Average Post-Release Purchases by Brand')
fig.tight_layout()
# The x ticks are cleared, so individual brand names are not drawn under the bars.
ax.set_xticks([])

plt.show()

In [None]:
# Preview the first rows of the full dataset.
df.head()

Filtering the DataFrame to include only rows where the `BRAND` is `Apple`.

In [16]:
# Keep only the rows whose BRAND is exactly 'Apple'.
is_apple = df['BRAND'].eq('Apple')
apple_df = df.loc[is_apple]

Aggregating `apple_df` by `MODEL` to calculate the average `POST_RELEASE_PURCHASE` and the earliest `RELEASE_DATE`.

In [29]:
# Per-model summary for Apple devices: mean POST_RELEASE_PURCHASE (renamed to
# POST_RELEASE_PURCHASE_AVG) and the smallest RELEASE_DATE, ordered by the
# average, highest first.
# NOTE(review): 'min' is applied to RELEASE_DATE as loaded; if that column is
# stored as text the ordering is lexicographic -- confirm the date format.
apple_purchases = (
    apple_df.groupby('MODEL')
    .agg({'POST_RELEASE_PURCHASE': 'mean', 'RELEASE_DATE': 'min'})
    .rename(columns={'POST_RELEASE_PURCHASE': 'POST_RELEASE_PURCHASE_AVG'})
    # Casting to int truncates the fractional part of the mean.
    .astype({'POST_RELEASE_PURCHASE_AVG': int})
    .sort_values(by='POST_RELEASE_PURCHASE_AVG', ascending=False)
)

In [None]:
# Show the per-model Apple summary, ordered by POST_RELEASE_PURCHASE_AVG descending.
apple_purchases

Aggregating the dataframe by `ID` to find the earliest `START_DATE` and the latest `END_DATE`.

In [31]:
# Per-customer activity window: earliest START_DATE and latest END_DATE for each ID.
#
# The date columns are parsed to datetimes *before* aggregating. `df` comes from
# a plain read_csv, so these columns are presumably still text, and min/max over
# text compares lexicographically -- that matches chronological order only for
# ISO-like formats (YYYY-MM-DD). Parsing first makes the result independent of
# the on-disk date format; an unparseable value now raises here instead of being
# silently mis-ordered.
#
# Only the three needed columns are copied, and `df` itself is left untouched.
customer_dates = df[['ID', 'START_DATE', 'END_DATE']].assign(
    START_DATE=lambda frame: pd.to_datetime(frame['START_DATE']),
    END_DATE=lambda frame: pd.to_datetime(frame['END_DATE']),
)
aggregated_df = (
    customer_dates
    .groupby('ID')
    .agg({'START_DATE': 'min', 'END_DATE': 'max'})
    .reset_index()
)

Aggregating the dataframe by `ID` to count the number of `DEVICE_ID`s per `ID`, renaming the count column to `NUMBER_OF_DEVICES`.


In [43]:
# Number of non-null DEVICE_ID entries per customer ID, exposed as NUMBER_OF_DEVICES.
# NOTE(review): this counts rows, not distinct devices -- a DEVICE_ID that appears
# on several rows for the same ID is counted each time; confirm that is intended.
devicecount_df = (
    df.groupby('ID')['DEVICE_ID']
    .count()
    .rename('NUMBER_OF_DEVICES')
    .reset_index()
)


Converting `START_DATE` and `END_DATE` to datetime, then calculating `CUSTOMER_LIFETIME` as the difference between them in days.


In [None]:
# Make sure both boundary columns are datetimes so they can be subtracted.
for date_col in ('START_DATE', 'END_DATE'):
    aggregated_df[date_col] = pd.to_datetime(aggregated_df[date_col])

# CUSTOMER_LIFETIME is the whole-day part of END_DATE - START_DATE.
lifetime = aggregated_df['END_DATE'] - aggregated_df['START_DATE']
aggregated_df['CUSTOMER_LIFETIME'] = lifetime.dt.days

aggregated_df

Merging `aggregated_df` and `devicecount_df` on `ID`, then computing `AVG_DAYS` as the average number of days per device.


In [None]:
# Combine each customer's lifetime with their device count (default inner join on ID).
# NOTE(review): this rebinds `merged_df`, replacing the brand-level frame built
# earlier in the notebook under the same name.
merged_df = aggregated_df.merge(devicecount_df, on='ID')

# AVG_DAYS = lifetime days per device, truncated to an integer.
# NOTE(review): astype(int) raises on NaN/inf, e.g. when NUMBER_OF_DEVICES is 0
# or a lifetime is missing -- confirm the data cannot contain such rows.
days_per_device = merged_df['CUSTOMER_LIFETIME'] / merged_df['NUMBER_OF_DEVICES']
merged_df['AVG_DAYS'] = days_per_device.astype(int)
merged_df

Examining potential correlations between `DAYS_USED`, `RAM_SIZE`, and `BATTERY_CAPACITY`.

In [None]:
# Pairwise correlations between device specs and usage duration.
usage_columns = ['RAM_SIZE', 'BATTERY_CAPACITY', 'DAYS_USED']
correlation_matrix = df.loc[:, usage_columns].corr()
print(correlation_matrix)

# Same annotated, zero-centred diverging heatmap style as the feature heatmap.
heatmap_ax = sns.heatmap(data=correlation_matrix, annot=True, cmap='coolwarm', center=0)
heatmap_ax.set_title('Correlation Heatmap')
plt.show()