In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Notebook display limits: show at most 6 columns and 100 rows of any frame.
pd.set_option('display.max_columns', 6)
pd.set_option('display.max_rows', 100)

# Read the Whatfix CSV export (pandas' default encoding) and display it.
whatfix_csv = r'C:\Users\shiha\Downloads\Projects\Work Projects\ITJob_projects\TableauWhatfix\whatfix.csv'
wf_df = pd.read_csv(whatfix_csv)
wf_df

In [None]:
# Notebook display limits: show at most 6 columns and 100 rows of any frame.
pd.set_option('display.max_columns', 6)
pd.set_option('display.max_rows', 100)

# Read the campus-wide CSV; the file is decoded as ISO-8859-1 rather than pandas' default.
campus_csv = r'C:\Users\shiha\Downloads\Projects\Work Projects\ITJob_projects\TableauWhatfix\campuswide.csv'
campus_df = pd.read_csv(campus_csv, encoding='ISO-8859-1')
campus_df

In [None]:
# Notebook display limits: show at most 6 columns and 100 rows of any frame.
pd.set_option('display.max_columns', 6)
pd.set_option('display.max_rows', 100)

# Read the sep.csv extract; the file is decoded as ISO-8859-1 rather than pandas' default.
sep_csv = r'C:\Users\shiha\Downloads\Projects\Work Projects\ITJob_projects\TableauWhatfix\sep.csv'
sep_df = pd.read_csv(sep_csv, encoding='ISO-8859-1')
sep_df

### Step 1: Data Cleaning Phase

In [None]:
#1a: Checking general info
# Print the DataFrame.info() summary for each of the three loaded frames, in load order.
for frame in (wf_df, campus_df, sep_df):
    frame.info()

### Step 2: Data Analysis Phase

In [None]:
#Question 1: How many users are Trellis users (1) vs non-Trellis users (0)?
# Row count per 'Trellis User' value, largest group first.
trellis_users = (
    campus_df
    .groupby('Trellis User')
    .size()
    .reset_index(name='Total Users')
    .sort_values(by='Total Users', ascending=False)
)
trellis_users

In [None]:
#Question 1 Visual:
# Pie chart of the Trellis / non-Trellis split; slices are labelled with the raw column value.
fig, ax = plt.subplots(figsize=(5, 5))
ax.pie(
    trellis_users['Total Users'],
    labels=trellis_users['Trellis User'],
    autopct='%1.1f%%',
    startangle=90,
)
ax.set_title('Distribution of Trellis Users')
ax.axis('equal')  # equal aspect ratio so the pie is drawn as a circle
plt.show()

In [None]:
#Question 2: How many users are Active users (1) vs non-Active users (0)?
# Row count per 'User: Active' value, largest group first.
active_users = (
    campus_df
    .groupby('User: Active')
    .size()
    .reset_index(name='Total Users')
    .sort_values(by='Total Users', ascending=False)
)
active_users

In [None]:
#Question 2 Visual:
# Pie chart of the active / non-active split; slices are labelled with the raw column value.
fig, ax = plt.subplots(figsize=(5, 5))
ax.pie(
    active_users['Total Users'],
    labels=active_users['User: Active'],
    autopct='%1.1f%%',
    startangle=90,
)
ax.set_title('Active Users vs Non-Active Users')
ax.axis('equal')  # equal aspect ratio so the pie is drawn as a circle
plt.show()

In [None]:
#Question 3: How many Employees are there per Job Title?
# Row count per 'Title', sorted descending, with a fresh 0..n-1 index.
employees = (
    campus_df
    .groupby('Title')
    .size()
    .reset_index(name='NumberofEmployees')
    .sort_values(by='NumberofEmployees', ascending=False)
    .reset_index(drop=True)
)
employees

In [None]:
#Question 3 Visual: horizontal bar chart of the most common job titles
n_titles = 25  # Change this number to show Top N
# Variable name kept for compatibility with other cells; it holds the first `n_titles` rows.
top_20_titles = employees.head(n_titles)

fig, ax = plt.subplots(figsize=(11, 6))
bars = ax.barh(top_20_titles['Title'], top_20_titles['NumberofEmployees'], color='blue')
# Write each count just past the end of its bar, since the x-axis is hidden below.
for bar in bars:
    ax.text(bar.get_width() + 1, bar.get_y() + bar.get_height() / 2,
            f'{int(bar.get_width())}',
            va='center', ha='left', fontsize=10)
ax.set_xlabel('Number of Employees')
# The title reports how many titles are actually plotted; it used to be hard-coded
# as "Top 20" while 25 rows were shown.
ax.set_title(f'Top {len(top_20_titles)} Job Titles by Number of Employees')
ax.invert_yaxis()  # first (largest) row at the top
ax.xaxis.set_visible(False)
fig.tight_layout()
plt.show()

In [None]:
#Question 4: How many Employees are there per Job Family?
# Row count per 'Job Family', sorted descending, with a fresh 0..n-1 index.
jobfam = (
    campus_df
    .groupby('Job Family')
    .size()
    .reset_index(name='NumberofEmployees')
    .sort_values(by='NumberofEmployees', ascending=False)
    .reset_index(drop=True)
)
jobfam

In [None]:
#Question 4 Visual: horizontal bar chart of the largest job families
n_job_families = 25  # Change this number to show Top N
# Variable name kept for compatibility with other cells; it holds the first `n_job_families` rows.
top_20_job_families = jobfam.head(n_job_families)

fig, ax = plt.subplots(figsize=(10, 6))
bars = ax.barh(top_20_job_families['Job Family'], top_20_job_families['NumberofEmployees'], color='green')
# Write each count just past the end of its bar, since the x-axis is hidden below.
for bar in bars:
    ax.text(bar.get_width() + 1, bar.get_y() + bar.get_height() / 2,
            f'{int(bar.get_width())}',
            va='center', ha='left', fontsize=10)
ax.set_xlabel('Number of Employees')
# The title reports how many families are actually plotted; it used to be hard-coded
# as "Top 20" while 25 rows were shown.
ax.set_title(f'Top {len(top_20_job_families)} Job Families by Number of Employees')
ax.invert_yaxis()  # first (largest) row at the top
ax.xaxis.set_visible(False)
fig.tight_layout()
plt.show()

In [None]:
#Question 5: How many Employees are there per Job Function (excluding the nulls)?
# groupby drops null keys by default, so rows without a 'Job Function' are not counted.
jobfunc = (
    campus_df
    .groupby('Job Function')
    .size()
    .reset_index(name='NumberofEmployees')
    .sort_values(by='NumberofEmployees', ascending=False)
    .reset_index(drop=True)
)
jobfunc

In [None]:
#Question 5 Visual:
# Vertical bar chart with one bar per job function, in the descending order of `jobfunc`.
fig, ax = plt.subplots(figsize=(12, 6))
ax.bar(jobfunc['Job Function'], jobfunc['NumberofEmployees'], color='orange')
ax.set_xlabel('Job Function')
ax.set_ylabel('Number of Employees')
ax.set_title('Job Functions by Number of Employees')
ax.tick_params(axis='x', labelrotation=90)  # vertical tick labels
fig.tight_layout()
plt.show()

In [None]:
#Question 6: How many Employees are there per Parent Organization?
# Row count per 'Parent Organization', sorted descending, with a fresh 0..n-1 index.
parorg = (
    campus_df
    .groupby('Parent Organization')
    .size()
    .reset_index(name='NumberofEmployees')
    .sort_values(by='NumberofEmployees', ascending=False)
    .reset_index(drop=True)
)
parorg

In [None]:
#Question 6 Visual: horizontal bar chart of the largest parent organizations
n_parent_orgs = 25  # Change this number to show Top N
# Variable name kept for compatibility with other cells; it holds the first `n_parent_orgs` rows.
top_20_parent_org = parorg.head(n_parent_orgs)

fig, ax = plt.subplots(figsize=(12, 6))
bars = ax.barh(top_20_parent_org['Parent Organization'], top_20_parent_org['NumberofEmployees'], color='red')
# Write each count just past the end of its bar, since the x-axis is hidden below.
for bar in bars:
    ax.text(bar.get_width() + 1, bar.get_y() + bar.get_height() / 2,
            f'{int(bar.get_width())}',
            va='center', ha='left', fontsize=10)
ax.set_xlabel('Number of Employees')
# The chart is truncated to the top rows, so the title says so, matching the other Top-N charts.
ax.set_title(f'Top {len(top_20_parent_org)} Parent Organizations by Number of Employees')
ax.invert_yaxis()  # first (largest) row at the top
ax.xaxis.set_visible(False)
fig.tight_layout()
plt.show()

In [None]:
#Question 7: How many Employees are there per Account Name?
# Row count per 'Organization: Account Name', sorted descending, with a fresh 0..n-1 index.
accountname = (
    campus_df
    .groupby('Organization: Account Name')
    .size()
    .reset_index(name='NumberofEmployees')
    .sort_values(by='NumberofEmployees', ascending=False)
    .reset_index(drop=True)
)
accountname

In [None]:
#Question 7 Visual: horizontal bar chart of the largest account names
n_accounts = 25  # Change this number to show Top N
# Variable name kept for compatibility with other cells; it holds the first `n_accounts` rows.
top_25_result = accountname.head(n_accounts)

fig, ax = plt.subplots(figsize=(12, 6))
bars = ax.barh(top_25_result['Organization: Account Name'], top_25_result['NumberofEmployees'], color='purple')
# Write each count just past the end of its bar, since the x-axis is hidden below.
for bar in bars:
    ax.text(bar.get_width() + 1, bar.get_y() + bar.get_height() / 2,
            f'{int(bar.get_width())}',
            va='center', ha='left', fontsize=10)
ax.set_xlabel('Number of Employees')
# The title follows the number of rows plotted, so it stays correct when `n_accounts` is edited.
ax.set_title(f'Top {len(top_25_result)} Account Names by Number of Employees')
ax.invert_yaxis()  # first (largest) row at the top
ax.xaxis.set_visible(False)
fig.tight_layout()
plt.show()

In [None]:
#Question 8: How many Employees are there per Profile Name (excluding the nulls)?
# groupby drops null keys by default, so rows without a profile name are not counted.
profilename = (
    campus_df
    .groupby('User: Profile: Name')
    .size()
    .reset_index(name='NumberofEmployees')
    .sort_values(by='NumberofEmployees', ascending=False)
    .reset_index(drop=True)
)
profilename

In [None]:
#Question 8 Visual:
# Vertical bar chart with one bar per profile name, in the descending order of `profilename`.
fig, ax = plt.subplots(figsize=(12, 6))
ax.bar(profilename['User: Profile: Name'], profilename['NumberofEmployees'], color='yellow')
ax.set_xlabel('Profile Name')
ax.set_ylabel('Number of Employees')
ax.set_title('Profile Names by Number of Employees')
ax.tick_params(axis='x', labelrotation=90)  # vertical tick labels
fig.tight_layout()
plt.show()

In [None]:
#Question 9: How many Employees are there per Role?
# Row count per 'Role', sorted descending, with a fresh 0..n-1 index.
role = (
    campus_df
    .groupby('Role')
    .size()
    .reset_index(name='NumberofEmployees')
    .sort_values(by='NumberofEmployees', ascending=False)
    .reset_index(drop=True)
)
role

In [None]:
#Question 9 Visual:
# Vertical bar chart with one bar per role, in the descending order of `role`.
fig, ax = plt.subplots(figsize=(12, 6))
ax.bar(role['Role'], role['NumberofEmployees'], color='skyblue')
ax.set_xlabel('Role')
ax.set_ylabel('Number of Employees')
ax.set_title('Roles by Number of Employees')
ax.tick_params(axis='x', labelrotation=90)  # vertical tick labels
fig.tight_layout()
plt.show()

In [None]:
#Question 10: How many Employees Engaged with Whatfix and how many didn't?
# Row count per 'Engaged with Whatfix?' value, sorted descending, with a fresh 0..n-1 index.
whatfixengagement = (
    wf_df
    .groupby('Engaged with Whatfix?')
    .size()
    .reset_index(name='TotalEmployees')
    .sort_values(by='TotalEmployees', ascending=False)
    .reset_index(drop=True)
)
whatfixengagement

In [None]:
#Question 10 Visual:
# Pie chart of the engagement split; slices are labelled with the raw column value.
fig, ax = plt.subplots(figsize=(5, 5))
ax.pie(
    whatfixengagement['TotalEmployees'],
    labels=whatfixengagement['Engaged with Whatfix?'],
    autopct='%1.1f%%',
    startangle=90,
    colors=['skyblue', 'lightcoral'],
)
ax.set_title('Engagement with Whatfix by Total Employees')
ax.axis('equal')  # equal aspect ratio so the pie is drawn as a circle
plt.show()

In [None]:
#Question 11: What are the top 10 users based on the Number of Engagements with Whatfix?
# Keep only the two columns of interest, rank by engagements, and take the first 10 rows.
engagement_columns = ['User Name', 'Number of Engagements']
top_10_engagements = (
    wf_df[engagement_columns]
    .sort_values(by='Number of Engagements', ascending=False)
    .head(10)
    .reset_index(drop=True)
)
top_10_engagements

In [None]:
#Question 11 Visual:
# Horizontal bar chart of the ten most engaged users.
# NOTE(review): the axis label and title say "Email" while the column plotted is
# 'User Name' - presumably that column holds e-mail addresses; confirm.
fig, ax = plt.subplots(figsize=(7, 5))
ax.barh(top_10_engagements['User Name'], top_10_engagements['Number of Engagements'], color='green')
ax.set_xlabel('Number of Engagements')
ax.set_ylabel('Email')
ax.set_title('Top 10 Emails by Number of Engagements')
ax.invert_yaxis()  # first (highest) row at the top
fig.tight_layout()
plt.show()

In [None]:
#Question 12: What are the Sum and Average Engagements (ex: total logins, total cases) by Role?
# Columns of sep_df to summarise. Each one yields a sum_<col> and an avg_<col> output column,
# so adding a metric means adding a single entry here.
engagement_cols = [
    'tot_logins',
    'tot_cases',
    'tot_case_details',
    'tot_events',
    'tot_campaigns',
    'tot_case_updates',
    'tot_appt',
    'tot_avail',
    'tot_act',
]

# Named-aggregation spec built in sum/avg pairs, so the output column order is
# sum_tot_logins, avg_tot_logins, sum_tot_cases, avg_tot_cases, ...
named_aggs = {}
for col in engagement_cols:
    named_aggs[f'sum_{col}'] = (col, 'sum')
    named_aggs[f'avg_{col}'] = (col, 'mean')

agg_df = (
    sep_df
    .groupby('UserRole.Name.x')
    .agg(**named_aggs)
    .reset_index()
    # Round every average to 0 decimal places; the sums are left untouched.
    .round({f'avg_{col}': 0 for col in engagement_cols})
    # Roles with the most total logins first.
    .sort_values(by='sum_tot_logins', ascending=False)
    .reset_index(drop=True)
)

agg_df

In [None]:
#Question 12 Visual: one panel per metric, each ranking the top roles for that metric
top_n = 10  # Change this to the desired number of top roles

# (agg_df column, human-readable label) for every panel, in display order.
metrics = [
    ('sum_tot_logins', 'Total Logins'),
    ('avg_tot_logins', 'Average Logins'),
    ('sum_tot_cases', 'Total Cases'),
    ('avg_tot_cases', 'Average Cases'),
    ('sum_tot_case_details', 'Total Case Details'),
    ('avg_tot_case_details', 'Average Case Details'),
    ('sum_tot_events', 'Total Events'),
    ('avg_tot_events', 'Average Events'),
    ('sum_tot_campaigns', 'Total Campaigns'),
    ('avg_tot_campaigns', 'Average Campaigns'),
    ('sum_tot_case_updates', 'Total Case Updates'),
    ('avg_tot_case_updates', 'Average Case Updates'),
    ('sum_tot_appt', 'Total Appointments'),
    ('avg_tot_appt', 'Average Appointments'),
    ('sum_tot_avail', 'Total Availability'),
    ('avg_tot_avail', 'Average Availability'),
    ('sum_tot_act', 'Total Activities'),
    ('avg_tot_act', 'Average Activities')
]

# Derive the grid from the metric list instead of hard-coding 6x3, so adding or removing
# a metric cannot raise an IndexError or leave blank panels. With the 18 metrics above
# this is still a 6x3 grid on an 18x18 figure.
n_cols = 3
n_rows = max(1, -(-len(metrics) // n_cols))  # ceiling division
fig, axes = plt.subplots(nrows=n_rows, ncols=n_cols, figsize=(18, 3 * n_rows), squeeze=False)
axes = axes.flatten()

for i, (metric, title) in enumerate(metrics):
    # Each panel is ranked independently by its own metric.
    top_roles = agg_df.sort_values(by=metric, ascending=False).head(top_n)

    # NOTE(review): recent seaborn releases warn when `palette` is passed without `hue`;
    # left unchanged here because the fix depends on the installed seaborn version.
    sns.barplot(x=top_roles[metric], y=top_roles['UserRole.Name.x'], ax=axes[i], palette='viridis')

    axes[i].set_title(f'Top {top_n} User Roles by {title}')
    axes[i].set_xlabel(title)
    axes[i].set_ylabel('')

# Hide any panels left over when len(metrics) is not a multiple of n_cols.
for unused_ax in axes[len(metrics):]:
    unused_ax.set_visible(False)

plt.tight_layout()
plt.show()