# Exploratory Data Analysis

Importing necessary libraries and reading the file

In [41]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


In [2]:
# Load the dataset. index_col=False stops pandas from using the first CSV
# column as the index, so every column stays a regular column.
df = pd.read_csv(
    'DATA_ML.csv',
    index_col=False,
)

In [3]:
# Largest END_DATE value in the data. It is reused further down as the
# reference "current date" when the IS_CHANGED flag is built.
end_date_max = df['END_DATE'].max()
end_date_max

'2024-07-12'

In [None]:
# Do not truncate the column list when a frame is displayed.
pd.set_option('display.max_columns', None)

# Remove the 'Unnamed: 0' column (presumably a leftover index column from an
# earlier export -- confirm against the CSV). errors='ignore' makes the cell
# safe to re-run: the original `del` raised KeyError once the column was gone.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')
df.head()

Checking whether any customer has more than one record for the same device with the same start date (duplicate ID / DEVICE_ID / START_DATE combinations)

In [5]:
# Flag every row whose (ID, DEVICE_ID, START_DATE) combination occurs more
# than once; keep=False marks all members of a duplicate group, not just the
# later ones.
duplicate_key = ['ID', 'DEVICE_ID', 'START_DATE']
is_duplicate = df.duplicated(subset=duplicate_key, keep=False)
duplicate_start_dates = df.loc[is_duplicate]

# Distinct users that own at least one duplicated combination.
users_with_non_unique_start_dates = duplicate_start_dates['ID'].unique()

len(users_with_non_unique_start_dates)

0

Sorting values by user ID and START_DATE

In [6]:
# Order records by user, then by START_DATE within each user. sort_values
# returns a new frame, so df itself keeps its original row order.
sort_keys = ['ID', 'START_DATE']
sorted_df = df.sort_values(by=sort_keys)

In [None]:
# Preview the first rows after sorting by ID and START_DATE.
sorted_df.head()

Average Brand Usage Duration

In [None]:
# Mean DAYS_USED per brand, as a one-column frame indexed by BRAND and
# ordered from the longest-used brand to the shortest.
days_by_brand = (
    sorted_df
    .groupby('BRAND')['DAYS_USED']
    .mean()
    .to_frame()
    .sort_values(by='DAYS_USED', ascending=False)
)
days_by_brand.head(10)

Calculating the total number of devices used per brand.

In [None]:
# Number of records with a non-null ID for each brand.
phones_per_brand = (
    sorted_df
    .groupby('BRAND')['ID']
    .count()
    .reset_index(name='TOTAL_COUNT')
)

# Same table ordered from the most to the least frequent brand. Note the
# singular name: `phone_per_brand` is the one the merge cell reads.
phone_per_brand = phones_per_brand.sort_values(by='TOTAL_COUNT', ascending=False)
phone_per_brand.head(10)

Merging dataframes & Renaming columns

In [None]:
# Combine the per-brand record count with the per-brand mean usage duration.
# days_by_brand carries BRAND as its index; merge(on='BRAND') matches it
# against the BRAND column of phone_per_brand.
summary_df = (
    phone_per_brand
    .merge(days_by_brand, on='BRAND', how='inner')
    .rename(columns={'DAYS_USED': 'MEAN_DAYS_USED', 'TOTAL_COUNT': 'PHONE_COUNT'})
)
# Round the mean to two decimals.
summary_df['MEAN_DAYS_USED'] = summary_df['MEAN_DAYS_USED'].round(2)
summary_df.head(10)

Identifying and counting transitions between brands

In [None]:
# For every record, look up the brand of the same user's next record.
# sorted_df is ordered by ID and START_DATE, so shift(-1) inside each ID group
# looks one row ahead; the last record of each user gets NaN.
# NOTE: the column is written onto sorted_df itself (the original did the same
# through the alias `temp = sorted_df`); a later cell drops it again.
sorted_df['NEXT_BRAND'] = sorted_df.groupby('ID')['BRAND'].shift(-1)

# Keep only the rows that are followed by another record of the same user.
brand_transitions = sorted_df.dropna(subset=['NEXT_BRAND'])

# Count each (from, to) pair by grouping on the two columns directly.
# The previous version glued both brands into one "A to B" string and split it
# again, which breaks for any brand whose name contains " to ", and it assigned
# a column on a filtered frame (SettingWithCopyWarning).
brand_change_counts = (
    brand_transitions
    .groupby(['BRAND', 'NEXT_BRAND'])
    .size()
    .reset_index(name='COUNT')
    .rename(columns={'BRAND': 'FROM', 'NEXT_BRAND': 'TO'})
)

# Most frequent transitions first.
brand_change_df = (
    brand_change_counts[['FROM', 'TO', 'COUNT']]
    .sort_values('COUNT', ascending=False)
)

In [14]:
# Human-readable label for each transition, e.g. "<FROM> to <TO>".
brand_change_df['CHANGE'] = brand_change_df['FROM'].str.cat(
    brand_change_df['TO'], sep=' to '
)

In [None]:
# Order transitions from the most to the least frequent and show the top 15.
brand_change_df = brand_change_df.sort_values(by='COUNT', ascending=False)
brand_change_df.head(n=15)

Transitions whose starting brand is 'Apple' (first 8 rows; this may include Apple-to-Apple changes, since the filter is on the 'FROM' column only)

In [None]:
# Rows whose starting brand is Apple; the filter does not look at 'TO'.
starts_with_apple = brand_change_df['FROM'].eq('Apple')
brand_change_df.loc[starts_with_apple].head(8)

Bar plot visualizing changes from one brand to another

In [None]:
# Grouped bar chart of the 15 most frequent transitions: one group per source
# brand on the x-axis, one coloured bar per destination brand.
top_changes = brand_change_df.head(15)

fig, ax = plt.subplots(figsize=(6, 3))
sns.set_palette("Set1")
sns.barplot(data=top_changes, x='FROM', y='COUNT', hue='TO', ax=ax)

ax.set_title('Phone Brand Changes')
ax.set_xlabel('From Brand')
ax.set_ylabel('Count of Changes')
plt.xticks(rotation=45)  # tilt the brand names on the x-axis
ax.legend(title='To Brand')
plt.show()


Deleting the column 'NEXT_BRAND'

In [18]:
# NEXT_BRAND was only needed for the transition counts; remove it again.
sorted_df = sorted_df.drop(columns=['NEXT_BRAND'])


In [None]:
# Show the first 12 rows to confirm NEXT_BRAND is gone.
sorted_df.head(12)

Normalizing the frequency of each 'OS_CODE' & mapping these frequencies back to the 'OS_CODE' column

In [20]:
# Frequency-encode OS_CODE: every code is replaced by its share among the
# non-null OS_CODE values (value_counts ignores NaN, so missing codes stay NaN).
# NOTE: re-running this cell would encode the already-encoded values again.
frequency_map = sorted_df['OS_CODE'].value_counts(normalize=True)
sorted_df = sorted_df.assign(OS_CODE=sorted_df['OS_CODE'].map(frequency_map))

In [None]:
# Check that OS_CODE now holds relative frequencies instead of the raw codes.
sorted_df.head()

Dropping the 'BRAND' and 'MODEL' columns

In [22]:
# BRAND and MODEL are not carried forward beyond this point.
sorted_df = sorted_df.drop(columns=['BRAND', 'MODEL'])

In [None]:
# Inspect how many records fall into each BODY_TYPE value before encoding it.
sorted_df['BODY_TYPE'].value_counts()

Normalizing the frequency of each 'BODY_TYPE' & mapping these frequencies back to the 'BODY_TYPE' column

In [24]:
# Frequency-encode BODY_TYPE: every value is replaced by its share among the
# non-null BODY_TYPE values (missing values stay NaN).
# NOTE: re-running this cell would encode the already-encoded values again.
frequency_map2 = sorted_df['BODY_TYPE'].value_counts(normalize=True)
sorted_df = sorted_df.assign(BODY_TYPE=sorted_df['BODY_TYPE'].map(frequency_map2))

In [25]:
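# Disabled: the date columns are kept for now, because END_DATE is still needed to build IS_CHANGED below.
# sorted_df = sorted_df.drop(['START_DATE', 'END_DATE','RELEASE_DATE'], axis=1)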

Normalizing the frequency of each 'OS_VENDOR_CODE' & mapping these frequencies back to the 'OS_VENDOR_CODE' column

In [26]:
# Frequency-encode OS_VENDOR_CODE: every value is replaced by its share among
# the non-null OS_VENDOR_CODE values (missing values stay NaN).
# NOTE(review): this column is dropped a few cells later, so the encoding is
# not carried into the final frame -- confirm it is still wanted.
frequency_map3 = sorted_df['OS_VENDOR_CODE'].value_counts(normalize=True)
sorted_df = sorted_df.assign(
    OS_VENDOR_CODE=sorted_df['OS_VENDOR_CODE'].map(frequency_map3)
)

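Create a new column 'IS_CHANGED' that is 1 when 'END_DATE' is not null and earlier than the reference date, and 0 otherwise. The reference date is the latest 'END_DATE' found in the data (`end_date_max`), not today's date.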

In [27]:
# Reference date: the latest END_DATE seen in the data (not today's date).
current_date = end_date_max

# Compare as real datetimes instead of strings. The original compared the raw
# END_DATE strings row by row, which is only right while every value is a
# zero-padded ISO date. Missing END_DATE values become NaT and are flagged 0.
end_dates = pd.to_datetime(sorted_df['END_DATE'])
reference_date = pd.Timestamp(current_date)

# 1 = END_DATE is present and strictly earlier than the reference date, else 0.
sorted_df['IS_CHANGED'] = (
    end_dates.notna() & (end_dates < reference_date)
).astype('int64')

In [None]:
# Peek at records flagged 0, i.e. END_DATE is missing or not earlier than current_date.
sorted_df[sorted_df['IS_CHANGED'] == 0].head()

Dropping the 'OS_VENDOR_CODE' column

In [29]:
# Remove OS_VENDOR_CODE from the working frame.
sorted_df = sorted_df.drop(columns=['OS_VENDOR_CODE'])

In [30]:
# Cast BATTERY_CAPACITY to integer. astype(int) raises if the column contains
# missing values, so this assumes it has none -- TODO confirm.
sorted_df = sorted_df.astype({'BATTERY_CAPACITY': int})

In [31]:
# Cast PIXEL_DENSITY to integer. astype(int) raises if the column contains
# missing values, so this assumes it has none -- TODO confirm.
sorted_df = sorted_df.astype({'PIXEL_DENSITY': int})

In [None]:
# Distribution of OS_TYPE_CODE values; the column is dropped in the next cell.
sorted_df['OS_TYPE_CODE'].value_counts()

Dropping the 'OS_TYPE_CODE' column

In [33]:
# Remove OS_TYPE_CODE from the working frame.
sorted_df = sorted_df.drop(columns=['OS_TYPE_CODE'])

In [None]:
# Distribution of MULTISIM_MODE_CODE values.
sorted_df['MULTISIM_MODE_CODE'].value_counts()

In [None]:
# Preview the frame after the encoding, casting and column drops above.
sorted_df.head()