In [None]:
import os

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

In [None]:
# Load the Salesforce user export into `sf_df`.
# The folder defaults to the original author's location, but it can be
# redirected by setting the WHATFIX_DATA_DIR environment variable so the
# notebook can run on another machine without editing this cell.
DATA_DIR = os.environ.get(
    'WHATFIX_DATA_DIR',
    r'C:\Users\shiha\Downloads\Projects\Work Projects\ITJob_projects\WhatFix Usage',
)
sf_df = pd.read_csv(os.path.join(DATA_DIR, 'SF_Users.csv'))
sf_df

In [None]:
# Load the Whatfix user export into `wf_df`.
# The folder defaults to the original author's location, but it can be
# redirected by setting the WHATFIX_DATA_DIR environment variable so the
# notebook can run on another machine without editing this cell.
DATA_DIR = os.environ.get(
    'WHATFIX_DATA_DIR',
    r'C:\Users\shiha\Downloads\Projects\Work Projects\ITJob_projects\WhatFix Usage',
)
wf_df = pd.read_csv(os.path.join(DATA_DIR, 'WhatfixUsers.csv'))
wf_df

### Step 1: Data Cleaning Phase

In [None]:
#1a: Print the column/dtype/non-null summary of both source tables
for frame in (sf_df, wf_df):
    frame.info()

In [None]:
#1b: Parse the 'Created Date' column with pandas and store the result back on sf_df
created_dates = pd.to_datetime(sf_df['Created Date'])
sf_df['Created Date'] = created_dates

### Step 2: Data Analysis Phase

In [None]:
#Question 1: What's the number of employees who are Non-Whatfix Users vs Whatfix Users?
# One row per value of 'Whatfix User?' with the number of sf_df rows carrying it.
usertype = (
    sf_df.groupby('Whatfix User?')
    .size()
    .rename('Total Employees')
    .reset_index()
)
usertype

In [None]:
#Question 1 Visual:
# Pie chart of the row counts per 'Whatfix User?' value.
usertype = (
    sf_df.groupby('Whatfix User?')
    .size()
    .rename('Total Employees')
    .reset_index()
)
pie_style = {
    'autopct': '%1.1f%%',                    # share of each wedge, one decimal
    'startangle': 90,
    'wedgeprops': {'edgecolor': 'black'},
}
fig, ax = plt.subplots(figsize=(5, 5))
ax.pie(usertype['Total Employees'], labels=usertype['Whatfix User?'], **pie_style)
ax.set_title('Distribution of Employees by User Type')
ax.axis('equal')  # equal aspect ratio so the pie is drawn as a circle
plt.show()

In [None]:
#Question 2: What's the breakdown of Departments by Whatfix vs Non-Whatfix Users, and their respective percentages?
# Lift the row-display limit so the full breakdown table is rendered. The
# setting is global and stays in effect for the cells that follow.
# The option is addressed by its registered name: an abbreviated/regex pattern
# such as 'display.max.rows' only works while it happens to match exactly one
# option, and pandas documents that such patterns may break in future versions.
pd.set_option('display.max_rows', None)

# Count rows per (user type, department); 'Percent' is each count's share of
# its department's total, rounded to one decimal.
dep = (
    sf_df.groupby(['Whatfix User?', 'Primary Department: Account Name'])
    .size()
    .reset_index(name='Total')
    .assign(
        Percent=lambda counts: (
            counts['Total']
            / counts.groupby('Primary Department: Account Name')['Total'].transform('sum')
            * 100
        ).round(1)
    )
    .sort_values(['Primary Department: Account Name', 'Total'], ascending=[True, False])
    .reset_index(drop=True)
)
dep

In [None]:
#Question 2 Visuals: The top bar graph shows the Total Values while the bottom one shows the percentages.
#You can also adjust the number of bars displayed to see Top N. (All this info applies to the next couple visuals!)
top_n = 20  # Adjust as needed
colors = ['#1f77b4', '#ff7f0e']

# Long table: rows per (user type, department) plus each count's share of its department.
dep = (
    sf_df.groupby(['Whatfix User?', 'Primary Department: Account Name'])
    .size()
    .reset_index(name='Total')
    .assign(
        Percent=lambda counts: (
            counts['Total']
            / counts.groupby('Primary Department: Account Name')['Total'].transform('sum')
            * 100
        ).round(1)
    )
    .sort_values(['Primary Department: Account Name', 'Total'], ascending=[True, False])
    .reset_index(drop=True)
)

# Wide tables: one row per department, one column per user type (missing combinations -> 0).
pivot_df_total = dep.pivot(index='Primary Department: Account Name', columns='Whatfix User?', values='Total').fillna(0)
pivot_df_total['Total Count'] = pivot_df_total.sum(axis=1)
# Keep the top_n departments by overall headcount; the percentage chart reuses the same order.
pivot_df_total_top = pivot_df_total.sort_values('Total Count', ascending=False).head(top_n)
sorted_index = pivot_df_total_top.index
pivot_df_total_top = pivot_df_total_top.drop(columns='Total Count')
pivot_df_percent = dep.pivot(index='Primary Department: Account Name', columns='Whatfix User?', values='Percent').fillna(0)
pivot_df_percent_top = pivot_df_percent.loc[sorted_index]

# Figure 1: absolute counts.
fig, ax1 = plt.subplots(figsize=(12, 6))
pivot_df_total_top.plot(kind='bar', stacked=True, ax=ax1, color=colors)
ax1.set_title(f'Top {top_n} Departments by Whatfix Users (Total)')
ax1.set_xlabel('Primary Department: Account Name')
ax1.set_xticklabels(ax1.get_xticklabels(), rotation=45, ha='right')
ax1.legend(title='Whatfix User?')
ax1.yaxis.set_visible(False)
for container in ax1.containers:
    # Skip labels on empty segments, matching the percentage chart below
    # (otherwise a stray "0" is drawn on zero-height bars).
    ax1.bar_label(container, labels=[f'{val:g}' if val > 0 else '' for val in container.datavalues], label_type='center', fontsize=10, color='black')
# Lay out this figure now: plt.tight_layout() only acts on the current
# (most recently created) figure, so a single call at the end would skip it.
fig.tight_layout()

# Figure 2: percentage split within each department.
fig, ax2 = plt.subplots(figsize=(12, 6))
pivot_df_percent_top.plot(kind='bar', stacked=True, ax=ax2, color=colors)
ax2.set_title(f'Top {top_n} Departments by Whatfix User Percentage')
ax2.set_xlabel('Primary Department: Account Name')
ax2.set_xticklabels(ax2.get_xticklabels(), rotation=45, ha='right')
ax2.legend(title='Whatfix User?')
ax2.yaxis.set_visible(False)
for container in ax2.containers:
    ax2.bar_label(container, labels=[f'{val:.1f}%' if val > 0 else '' for val in container.datavalues], label_type='center', fontsize=7, color='black')
fig.tight_layout()
plt.show()

In [None]:
#Question 3: What's the breakdown of Profile Names by Whatfix vs Non-Whatfix Users, and their respective percentages?
# Count rows per (user type, profile); 'Percent' is each count's share of its
# profile's total, rounded to one decimal.
profile = (
    sf_df.groupby(['Whatfix User?', 'User: Profile: Name'])
    .size()
    .reset_index(name='Total')
    .assign(
        Percent=lambda counts: (
            counts['Total']
            / counts.groupby('User: Profile: Name')['Total'].transform('sum')
            * 100
        ).round(1)
    )
    .sort_values(['User: Profile: Name', 'Whatfix User?'], ascending=[True, False])
    .reset_index(drop=True)
)
profile

In [None]:
#Question 3 Visuals:
top_n = 10  # Adjust as needed
colors = ['#8a2be2', '#ff6347']

# Long table: rows per (user type, profile) plus each count's share of its profile.
profile = (
    sf_df.groupby(['Whatfix User?', 'User: Profile: Name'])
    .size()
    .reset_index(name='Total')
    .assign(
        Percent=lambda counts: (
            counts['Total']
            / counts.groupby('User: Profile: Name')['Total'].transform('sum')
            * 100
        ).round(1)
    )
    .sort_values(['User: Profile: Name', 'Total'], ascending=[True, False])
    .reset_index(drop=True)
)

# Wide tables: one row per profile, one column per user type (missing combinations -> 0).
pivot_df_total = profile.pivot(index='User: Profile: Name', columns='Whatfix User?', values='Total').fillna(0)
pivot_df_total['Total Count'] = pivot_df_total.sum(axis=1)
# Keep the top_n profiles by overall headcount; the percentage chart reuses the same order.
pivot_df_total_top = pivot_df_total.sort_values('Total Count', ascending=False).head(top_n)
sorted_index = pivot_df_total_top.index
pivot_df_total_top = pivot_df_total_top.drop(columns='Total Count')
pivot_df_percent = profile.pivot(index='User: Profile: Name', columns='Whatfix User?', values='Percent').fillna(0)
pivot_df_percent_top = pivot_df_percent.loc[sorted_index]

# Figure 1: absolute counts.
fig, ax1 = plt.subplots(figsize=(12, 6))
pivot_df_total_top.plot(kind='bar', stacked=True, ax=ax1, color=colors)
ax1.set_title(f'Top {top_n} Profiles by Whatfix Users (Total)')
ax1.set_xlabel('User: Profile: Name')
ax1.set_xticklabels(ax1.get_xticklabels(), rotation=45, ha='right')
ax1.legend(title='Whatfix User?')
ax1.yaxis.set_visible(False)
for container in ax1.containers:
    # Skip labels on empty segments, matching the percentage chart below
    # (otherwise a stray "0" is drawn on zero-height bars).
    ax1.bar_label(container, labels=[f'{val:g}' if val > 0 else '' for val in container.datavalues], label_type='center', fontsize=10, color='black')
# Lay out this figure now: plt.tight_layout() only acts on the current
# (most recently created) figure, so a single call at the end would skip it.
fig.tight_layout()

# Figure 2: percentage split within each profile.
fig, ax2 = plt.subplots(figsize=(12, 6))
pivot_df_percent_top.plot(kind='bar', stacked=True, ax=ax2, color=colors)
ax2.set_title(f'Top {top_n} Profiles by Whatfix User Percentage')
ax2.set_xlabel('User: Profile: Name')
ax2.set_xticklabels(ax2.get_xticklabels(), rotation=45, ha='right')
ax2.legend(title='Whatfix User?')
ax2.yaxis.set_visible(False)
for container in ax2.containers:
    ax2.bar_label(container, labels=[f'{val:.1f}%' if val > 0 else '' for val in container.datavalues], label_type='center', fontsize=10, color='black')
fig.tight_layout()
plt.show()

In [None]:
#Question 4: What's the breakdown of Parent Organization by Whatfix vs Non-Whatfix Users, and their respective percentages?
# Count rows per (user type, parent organization); 'Percent' is each count's
# share of its organization's total, rounded to one decimal.
parorg = (
    sf_df.groupby(['Whatfix User?', 'Parent Organization'])
    .size()
    .reset_index(name='Total')
    .assign(
        Percent=lambda counts: (
            counts['Total']
            / counts.groupby('Parent Organization')['Total'].transform('sum')
            * 100
        ).round(1)
    )
    .sort_values(['Parent Organization', 'Whatfix User?'], ascending=[True, False])
    .reset_index(drop=True)
)
parorg

In [None]:
#Question 4 Visuals:
top_n = 10  # Adjust as needed
colors = ['#32cd32', '#1e90ff']

# Long table: rows per (user type, parent organization) plus each count's share of its organization.
parorg = (
    sf_df.groupby(['Whatfix User?', 'Parent Organization'])
    .size()
    .reset_index(name='Total')
    .assign(
        Percent=lambda counts: (
            counts['Total']
            / counts.groupby('Parent Organization')['Total'].transform('sum')
            * 100
        ).round(1)
    )
    .sort_values(['Parent Organization', 'Total'], ascending=[True, False])
    .reset_index(drop=True)
)

# Wide tables: one row per organization, one column per user type (missing combinations -> 0).
pivot_df_total = parorg.pivot(index='Parent Organization', columns='Whatfix User?', values='Total').fillna(0)
pivot_df_total['Total Count'] = pivot_df_total.sum(axis=1)
# Keep the top_n organizations by overall headcount; the percentage chart reuses the same order.
pivot_df_total_top = pivot_df_total.sort_values('Total Count', ascending=False).head(top_n)
sorted_index = pivot_df_total_top.index
pivot_df_total_top = pivot_df_total_top.drop(columns='Total Count')
pivot_df_percent = parorg.pivot(index='Parent Organization', columns='Whatfix User?', values='Percent').fillna(0)
pivot_df_percent_top = pivot_df_percent.loc[sorted_index]

# Figure 1: absolute counts.
fig, ax1 = plt.subplots(figsize=(12, 6))
pivot_df_total_top.plot(kind='bar', stacked=True, ax=ax1, color=colors)
ax1.set_title(f'Top {top_n} Parent Organizations by Whatfix Users (Total)')
ax1.set_xlabel('Parent Organization')
ax1.set_xticklabels(ax1.get_xticklabels(), rotation=45, ha='right')
ax1.legend(title='Whatfix User?')
ax1.yaxis.set_visible(False)
for container in ax1.containers:
    # Skip labels on empty segments, matching the percentage chart below
    # (otherwise a stray "0" is drawn on zero-height bars).
    ax1.bar_label(container, labels=[f'{val:g}' if val > 0 else '' for val in container.datavalues], label_type='center', fontsize=10, color='black')
# Lay out this figure now: plt.tight_layout() only acts on the current
# (most recently created) figure, so a single call at the end would skip it.
fig.tight_layout()

# Figure 2: percentage split within each organization.
fig, ax2 = plt.subplots(figsize=(12, 6))
pivot_df_percent_top.plot(kind='bar', stacked=True, ax=ax2, color=colors)
ax2.set_title(f'Top {top_n} Parent Organizations by Whatfix User Percentage')
ax2.set_xlabel('Parent Organization')
ax2.set_xticklabels(ax2.get_xticklabels(), rotation=45, ha='right')
ax2.legend(title='Whatfix User?')
ax2.yaxis.set_visible(False)
for container in ax2.containers:
    ax2.bar_label(container, labels=[f'{val:.1f}%' if val > 0 else '' for val in container.datavalues], label_type='center', fontsize=8, color='black')
fig.tight_layout()
plt.show()

In [None]:
#Question 5: What's the breakdown of EDS Primary Affiliation by Whatfix vs Non-Whatfix Users, and their respective percentages?
# Count rows per (user type, affiliation); 'Percent' is each count's share of
# its affiliation's total, rounded to one decimal.
eds = (
    sf_df.groupby(['Whatfix User?', 'EDS Primary Affiliation'])
    .size()
    .reset_index(name='Total')
    .assign(
        Percent=lambda counts: (
            counts['Total']
            / counts.groupby('EDS Primary Affiliation')['Total'].transform('sum')
            * 100
        ).round(1)
    )
    .sort_values(['EDS Primary Affiliation', 'Whatfix User?'], ascending=[True, False])
    .reset_index(drop=True)
)
eds

In [None]:
#Question 5 Visuals:
top_n = 10  # Adjust as needed
colors = ['#FF1493', '#40E0D0']

# Long table: rows per (user type, affiliation) plus each count's share of its affiliation.
eds = (
    sf_df.groupby(['Whatfix User?', 'EDS Primary Affiliation'])
    .size()
    .reset_index(name='Total')
    .assign(
        Percent=lambda counts: (
            counts['Total']
            / counts.groupby('EDS Primary Affiliation')['Total'].transform('sum')
            * 100
        ).round(1)
    )
    .sort_values(['EDS Primary Affiliation', 'Total'], ascending=[True, False])
    .reset_index(drop=True)
)

# Wide tables: one row per affiliation, one column per user type (missing combinations -> 0).
pivot_df_total = eds.pivot(index='EDS Primary Affiliation', columns='Whatfix User?', values='Total').fillna(0)
pivot_df_total['Total Count'] = pivot_df_total.sum(axis=1)
# Keep the top_n affiliations by overall headcount; the percentage chart reuses the same order.
pivot_df_total_top = pivot_df_total.sort_values('Total Count', ascending=False).head(top_n)
sorted_index = pivot_df_total_top.index
pivot_df_total_top = pivot_df_total_top.drop(columns='Total Count')
pivot_df_percent = eds.pivot(index='EDS Primary Affiliation', columns='Whatfix User?', values='Percent').fillna(0)
pivot_df_percent_top = pivot_df_percent.loc[sorted_index]

# Figure 1: absolute counts.
fig, ax1 = plt.subplots(figsize=(12, 6))
pivot_df_total_top.plot(kind='bar', stacked=True, ax=ax1, color=colors)
ax1.set_title(f'Top {top_n} EDS Primary Affiliation by Whatfix Users (Total)')
ax1.set_xlabel('EDS Primary Affiliation')
ax1.set_xticklabels(ax1.get_xticklabels(), rotation=45, ha='right')
ax1.legend(title='Whatfix User?')
ax1.yaxis.set_visible(False)
for container in ax1.containers:
    # Skip labels on empty segments, matching the percentage chart below
    # (otherwise a stray "0" is drawn on zero-height bars).
    ax1.bar_label(container, labels=[f'{val:g}' if val > 0 else '' for val in container.datavalues], label_type='center', fontsize=8, color='black')
# Lay out this figure now: plt.tight_layout() only acts on the current
# (most recently created) figure, so a single call at the end would skip it.
fig.tight_layout()

# Figure 2: percentage split within each affiliation.
fig, ax2 = plt.subplots(figsize=(12, 6))
pivot_df_percent_top.plot(kind='bar', stacked=True, ax=ax2, color=colors)
ax2.set_title(f'Top {top_n} EDS Primary Affiliation by Whatfix User Percentage')
ax2.set_xlabel('EDS Primary Affiliation')
ax2.set_xticklabels(ax2.get_xticklabels(), rotation=45, ha='right')
ax2.legend(title='Whatfix User?')
ax2.yaxis.set_visible(False)
for container in ax2.containers:
    ax2.bar_label(container, labels=[f'{val:.1f}%' if val > 0 else '' for val in container.datavalues], label_type='center', fontsize=9, color='black')
fig.tight_layout()
plt.show()

In [None]:
#Question 6: What's the breakdown of a user's Created Date by Whatfix vs Non-Whatfix Users, and their respective percentages?
# Count rows per (user type, created date); 'Percent' is each count's share of
# that date's total, rounded to one decimal. Newest dates are listed first.
date = (
    sf_df.groupby(['Whatfix User?', 'Created Date'])
    .size()
    .reset_index(name='Total')
    .assign(
        Percent=lambda counts: (
            counts['Total']
            / counts.groupby('Created Date')['Total'].transform('sum')
            * 100
        ).round(1)
    )
    .sort_values(['Created Date', 'Whatfix User?'], ascending=[False, False])
    .reset_index(drop=True)
)
date

In [None]:
#Question 6 Visuals:
top_n = 10  # Adjust as needed
colors = ['#4682B4', '#FF00FF']

# Long table: rows per (user type, created date) plus each count's share of that date.
# NOTE(review): grouping uses the exact 'Created Date' values; if they carry a
# time-of-day component each distinct timestamp becomes its own group - confirm
# that this is the intended granularity.
date = (
    sf_df.groupby(['Whatfix User?', 'Created Date'])
    .size()
    .reset_index(name='Total')
    .assign(
        Percent=lambda counts: (
            counts['Total']
            / counts.groupby('Created Date')['Total'].transform('sum')
            * 100
        ).round(1)
    )
    .sort_values(['Created Date', 'Whatfix User?'], ascending=[False, False])
    .reset_index(drop=True)
)

# Wide tables: one row per created date, one column per user type (missing combinations -> 0).
pivot_df_total = date.pivot(index='Created Date', columns='Whatfix User?', values='Total').fillna(0)
pivot_df_total['Total Count'] = pivot_df_total.sum(axis=1)
# Keep the top_n dates by overall headcount; the percentage chart reuses the same order.
pivot_df_total_top = pivot_df_total.sort_values('Total Count', ascending=False).head(top_n)
sorted_index = pivot_df_total_top.index
pivot_df_total_top = pivot_df_total_top.drop(columns='Total Count')
pivot_df_percent = date.pivot(index='Created Date', columns='Whatfix User?', values='Percent').fillna(0)
pivot_df_percent_top = pivot_df_percent.loc[sorted_index]

# Figure 1: absolute counts.
fig, ax1 = plt.subplots(figsize=(12, 6))
pivot_df_total_top.plot(kind='bar', stacked=True, ax=ax1, color=colors)
ax1.set_title(f'Top {top_n} Created Date by Whatfix Users (Total)')
ax1.set_xlabel('Created Date')
ax1.set_xticklabels(ax1.get_xticklabels(), rotation=45, ha='right')
ax1.legend(title='Whatfix User?')
ax1.yaxis.set_visible(False)
for container in ax1.containers:
    # Skip labels on empty segments, matching the percentage chart below
    # (otherwise a stray "0" is drawn on zero-height bars).
    ax1.bar_label(container, labels=[f'{val:g}' if val > 0 else '' for val in container.datavalues], label_type='center', fontsize=7, color='black')
# Lay out this figure now: plt.tight_layout() only acts on the current
# (most recently created) figure, so a single call at the end would skip it.
fig.tight_layout()

# Figure 2: percentage split within each created date.
fig, ax2 = plt.subplots(figsize=(12, 6))
pivot_df_percent_top.plot(kind='bar', stacked=True, ax=ax2, color=colors)
ax2.set_title(f'Top {top_n} Created Date by Whatfix User Percentage')
ax2.set_xlabel('Created Date')
ax2.set_xticklabels(ax2.get_xticklabels(), rotation=45, ha='right')
ax2.legend(title='Whatfix User?')
ax2.yaxis.set_visible(False)
for container in ax2.containers:
    ax2.bar_label(container, labels=[f'{val:.1f}%' if val > 0 else '' for val in container.datavalues], label_type='center', fontsize=10, color='black')
fig.tight_layout()
plt.show()

In [None]:
#Question 7: What's the breakdown of Roles by Whatfix vs Non-Whatfix Users, and their respective percentages?
# Count rows per (user type, role); 'Percent' is each count's share of its
# role's total, rounded to one decimal.
role = (
    sf_df.groupby(['Whatfix User?', 'User: Role: Name'])
    .size()
    .reset_index(name='Total')
    .assign(
        Percent=lambda counts: (
            counts['Total']
            / counts.groupby('User: Role: Name')['Total'].transform('sum')
            * 100
        ).round(1)
    )
    .sort_values(['User: Role: Name', 'Whatfix User?'], ascending=[True, False])
    .reset_index(drop=True)
)
role

In [None]:
#Question 7 Visuals:
top_n = 10  # Adjust as needed
colors = ['#FF6347', '#32CD32']

# Long table: rows per (user type, role) plus each count's share of its role.
role = (
    sf_df.groupby(['Whatfix User?', 'User: Role: Name'])
    .size()
    .reset_index(name='Total')
    .assign(
        Percent=lambda counts: (
            counts['Total']
            / counts.groupby('User: Role: Name')['Total'].transform('sum')
            * 100
        ).round(1)
    )
    .sort_values(['User: Role: Name', 'Whatfix User?'], ascending=[True, False])
    .reset_index(drop=True)
)

# Wide tables: one row per role, one column per user type (missing combinations -> 0).
pivot_df_total = role.pivot(index='User: Role: Name', columns='Whatfix User?', values='Total').fillna(0)
pivot_df_total['Total Count'] = pivot_df_total.sum(axis=1)
# Keep the top_n roles by overall headcount; the percentage chart reuses the same order.
pivot_df_total_top = pivot_df_total.sort_values('Total Count', ascending=False).head(top_n)
sorted_index = pivot_df_total_top.index
pivot_df_total_top = pivot_df_total_top.drop(columns='Total Count')
pivot_df_percent = role.pivot(index='User: Role: Name', columns='Whatfix User?', values='Percent').fillna(0)
pivot_df_percent_top = pivot_df_percent.loc[sorted_index]

# Figure 1: absolute counts.
fig, ax1 = plt.subplots(figsize=(12, 6))
pivot_df_total_top.plot(kind='bar', stacked=True, ax=ax1, color=colors)
ax1.set_title(f'Top {top_n} User Roles by Whatfix Users (Total)')
ax1.set_xlabel('User Role')
ax1.set_xticklabels(ax1.get_xticklabels(), rotation=45, ha='right')
ax1.legend(title='Whatfix User?')
ax1.yaxis.set_visible(False)
for container in ax1.containers:
    # Skip labels on empty segments, matching the percentage chart below
    # (otherwise a stray "0" is drawn on zero-height bars).
    ax1.bar_label(container, labels=[f'{val:g}' if val > 0 else '' for val in container.datavalues], label_type='center', fontsize=10, color='black')
# Lay out this figure now: plt.tight_layout() only acts on the current
# (most recently created) figure, so a single call at the end would skip it.
fig.tight_layout()

# Figure 2: percentage split within each role.
fig, ax2 = plt.subplots(figsize=(12, 6))
pivot_df_percent_top.plot(kind='bar', stacked=True, ax=ax2, color=colors)
ax2.set_title(f'Top {top_n} User Roles by Whatfix User Percentage')
ax2.set_xlabel('User Role')
ax2.set_xticklabels(ax2.get_xticklabels(), rotation=45, ha='right')
ax2.legend(title='Whatfix User?')
ax2.yaxis.set_visible(False)
for container in ax2.containers:
    ax2.bar_label(container, labels=[f'{val:.1f}%' if val > 0 else '' for val in container.datavalues], label_type='center', fontsize=10, color='black')
fig.tight_layout()
plt.show()

In [None]:
#Question 8: How many employees work for each Department?
# Row count per department, largest department first.
employee = (
    sf_df.groupby('Primary Department: Account Name')
    .size()
    .reset_index(name='Total Employees')
    .sort_values('Total Employees', ascending=False)
    .reset_index(drop=True)
)
employee

In [None]:
#Question 8 Visual:
top_n = 20 #Adjust as needed

# Row count per department, largest first, trimmed to the top_n rows for plotting.
employee = (
    sf_df.groupby('Primary Department: Account Name')
    .size()
    .reset_index(name='Total Employees')
    .sort_values('Total Employees', ascending=False)
    .reset_index(drop=True)
)
employee_top_n = employee.head(top_n)
department_names = employee_top_n['Primary Department: Account Name']

fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(department_names, employee_top_n['Total Employees'], color='darkblue')
ax.set(
    title=f'Top {top_n} Departments by Total Employees',
    xlabel='Department',
    ylabel='Total Employees',
)
# Pin one tick per bar before attaching the rotated department labels.
ax.set_xticks(range(len(employee_top_n)))
ax.set_xticklabels(department_names, rotation=45, ha='right')
plt.tight_layout()
plt.show()

In [None]:
#Question 10: What Roles had at least 15 total employees?
# Row count per role, keeping roles whose count is >= 15, largest first.
grouped_df = (
    sf_df.groupby('User: Role: Name')
    .size()
    .reset_index(name='Total Employees')
)
filtered_df = grouped_df.loc[grouped_df['Total Employees'].ge(15)]
sorted_df = (
    filtered_df
    .sort_values('Total Employees', ascending=False)
    .reset_index(drop=True)
)
sorted_df

In [None]:
#Question 10 Visual:
# Row count per role, keeping roles whose count is >= 15, largest first.
grouped_df = (
    sf_df.groupby('User: Role: Name')
    .size()
    .reset_index(name='Total Employees')
)
filtered_df = grouped_df.loc[grouped_df['Total Employees'].ge(15)]
sorted_df = (
    filtered_df
    .sort_values('Total Employees', ascending=False)
    .reset_index(drop=True)
)
role_names = sorted_df['User: Role: Name']

fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(role_names, sorted_df['Total Employees'], color='purple')
ax.set(
    title='Employee Count by Role (>= 15 Employees)',
    xlabel='Role',
    ylabel='Total Employees',
)
# Pin one tick per bar before attaching the rotated role labels.
ax.set_xticks(range(len(sorted_df)))
ax.set_xticklabels(role_names, rotation=45, ha='right')
plt.tight_layout()
plt.show()