# Interview Data Analysis

In [None]:
#Importing Matplotlib, NumPy and Pandas for the data analysis
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [None]:
#Reading in the Interview csv file (ISO-8859-1 encoded)
#NOTE(review): this is an absolute path on one machine - confirm it before running elsewhere
df = pd.read_csv(r"C:\Users\shiha\Downloads\Projects\Work Projects\ITJob_projects\ExcelHandshake\HS_Scorecard_Interviews.csv",
                encoding = 'ISO-8859-1')
#Raising the display limits to 300 rows and 10 columns.
#The full option names are used: pandas treats the option string as a regular expression,
#so 'display.max.rows' only worked through a partial match that may break in future versions.
pd.set_option('display.max_rows', 300)
pd.set_option('display.max_columns', 10)
df

In [None]:
#Getting a general overview of the Interview data
#(df.info() prints the columns with their non-null counts and data types)
df.info()

In [None]:
#Removing every row that holds at least one missing value
#(per the overview, this is expected to be a single NaN row)
df = df.dropna(axis=0, how='any')

In [None]:
#Casting the Employer ID column to int (it was read in as a Float)
df = df.astype({'Employer ID': int})

In [None]:
#Parsing the Interview Date column (read in as an Object) into datetime values
df = df.assign(**{'Interview Date': pd.to_datetime(df['Interview Date'])})

In [None]:
#Question 1 Code: How many times did each Employer host an Interview?

#Counting the non-null Interview Date values for each Employer Name
group_by_employer = (
    df.groupby('Employer Name')['Interview Date']
    .count()
    .reset_index(name='num_times_attended')
)
#Ordering from the highest count to the lowest and renumbering the rows from 0
group_by_employer_sorted = (
    group_by_employer
    .sort_values('num_times_attended', ascending=False)
    .reset_index(drop=True)
)
group_by_employer_sorted

In [None]:
#Question 1 Bar Graph: How many times did each Employer host an Interview?
#Note: To change the number of Employers displayed, change the number passed to head() below (between 1 and 254)!
#Insights: Honeywell, TTI, and E & J hosted the most interviews.

#Keeping only the first 25 rows of the sorted table
top_n_employers = group_by_employer_sorted.head(25)
fig, ax = plt.subplots(figsize=(15, 5))
ax.bar(top_n_employers['Employer Name'], top_n_employers['num_times_attended'], color='orange')
ax.set(title='Interviews Per Employer', xlabel='Employer Name', ylabel='Number of Interviews')
#Rotating the employer names by 90 degrees
ax.tick_params(axis='x', labelrotation=90)
plt.show()

In [None]:
#Question 2 Code: How many Interviews did each Career Center host?

#Counting the non-null Interview Date values for each Career Center Name
group_by_career_center = (
    df.groupby('Career Center Name')['Interview Date']
    .count()
    .reset_index(name='num_interviews')
)
#Ordering from the highest count to the lowest and renumbering the rows from 0
group_by_career_center_sorted = (
    group_by_career_center
    .sort_values('num_interviews', ascending=False)
    .reset_index(drop=True)
)
group_by_career_center_sorted

In [None]:
#Question 2 Bar Graph: How many Interviews did each Career Center host?
#Insights: CAPLA and Eller hosted the most interviews.

fig, ax = plt.subplots(figsize=(7, 5))
ax.bar(group_by_career_center_sorted['Career Center Name'], group_by_career_center_sorted['num_interviews'], color='orange')
ax.set(title='Number of Interviews Per Career Center', xlabel='Career Center', ylabel='Number of Interviews')
#Rotating the career center names by 90 degrees
ax.tick_params(axis='x', labelrotation=90)
plt.show()

In [None]:
#Question 3 Line Graph: When did these Interviews take place?
#Insights: During Quarter 1 of 2022 and 2023, the number of Interviews spiked up significantly.

#Adding a Quarter column holding the quarterly period of each Interview Date
df['Quarter'] = df['Interview Date'].dt.to_period(freq='Q')
#Number of rows (interviews) in each quarter
quarterly_data = df.groupby(by='Quarter').size()
fig, ax = plt.subplots(figsize=(10, 6))
quarterly_data.plot(ax=ax, kind='line', marker='o', color='orange')
ax.set_title('Number of Interviews by Quarter')
ax.set_xlabel('Quarter')
ax.set_ylabel('Number of Interviews')
ax.grid(False)
ax.tick_params(axis='x', labelrotation=45)
fig.tight_layout()
plt.show()

# Events Data Analysis

In [None]:
#Reading in the Events csv file (ISO-8859-1 encoded)
#NOTE(review): this is an absolute path on one machine - confirm it before running elsewhere
df2 = pd.read_csv(r"C:\Users\shiha\Downloads\Projects\Work Projects\ITJob_projects\ExcelHandshake\HS_Scorecard_Events.csv",
                encoding = 'ISO-8859-1')
#Raising the display limits to 1600 rows and 10 columns.
#The full option names are used: pandas treats the option string as a regular expression,
#so 'display.max.rows' only worked through a partial match that may break in future versions.
pd.set_option('display.max_rows', 1600)
pd.set_option('display.max_columns', 10)
df2

In [None]:
#Getting a general overview of the Events data
#(df2.info() prints the columns with their non-null counts and data types)
df2.info()

In [None]:
#Dropping the Null row by its missing Events ID instead of a hard-coded index label,
#so the clean-up keeps working when the export gains or loses rows.
#NOTE(review): this replaces df2.drop(1527); it assumes the Null row at index 1527 is the
#only row without an Events ID - confirm against the df2.info() output.
df2 = df2.dropna(subset=['Events ID'])

In [None]:
#Casting the Events ID column to int (it was read in as a Float)
df2 = df2.astype({'Events ID': int})

In [None]:
#Parsing the Events Start Date column (read in as a String) into datetime values
df2 = df2.assign(**{'Events Start Date': pd.to_datetime(df2['Events Start Date'])})

In [None]:
#Casting the Attendees Count column to int (it was read in as a Float)
df2 = df2.astype({'Attendees Count': int})

In [None]:
#Most of the records in the Career Center Name column are Null, so they're replaced with 'Unknown'.
#The fill is applied to the whole frame, so a missing value in any other column becomes 'Unknown' too.
df2 = df2.fillna(value='Unknown')

In [None]:
#Changing 3 column names to more concise names (1 of 3): Events Start Date -> Events Date
df2 = df2.rename(columns={'Events Start Date': 'Events Date'})

In [None]:
#Shortening the column name Event Type Name to Event Type
df2 = df2.rename(columns={'Event Type Name': 'Event Type'})

In [None]:
#Shortening the column name Career Center Name to Career Center
df2 = df2.rename(columns={'Career Center Name': 'Career Center'})

In [None]:
#Question 1 Code: How many times was each Event Type held?

#Counting the non-null Employers Name values for each Event Type
group_by_event_type = (
    df2.groupby('Event Type')['Employers Name']
    .count()
    .reset_index(name='num_employers')
)
#Ordering from the highest count to the lowest and renumbering the rows from 0
group_by_event_type_sorted = (
    group_by_event_type
    .sort_values('num_employers', ascending=False)
    .reset_index(drop=True)
)
group_by_event_type_sorted

In [None]:
#Question 1 Bar Graph: How many times was each Event Type held?
#Insights: The Info Session Event Type was by far held the most amount of times.

fig, ax = plt.subplots(figsize=(13, 7))
ax.bar(group_by_event_type_sorted['Event Type'], group_by_event_type_sorted['num_employers'], color='green')
ax.set(title='Total Times Each Event Type Was Held', xlabel='Event Type', ylabel='Number of Times')
#Rotating the event type names by 90 degrees
ax.tick_params(axis='x', labelrotation=90)
plt.show()

In [None]:
#Question 2 Code: How many Attendees were there for each Employer?

#Summing Attendees Count for each Employers Name
group_by_employer_attendees = df2.groupby(['Employers Name']).agg(Total_Attendees = ('Attendees Count', 'sum')).reset_index()
#Ordering from the highest total to the lowest and renumbering the rows from 0
group_by_employer_attendees_sorted = group_by_employer_attendees.sort_values(by='Total_Attendees', ascending=False).reset_index(drop=True)
#Limiting the display to 500 rows; the full option name is used because
#'display.max.rows' only worked through pandas' regular-expression partial matching
pd.set_option('display.max_rows', 500)
group_by_employer_attendees_sorted

In [None]:
#Question 2 Bar Graph: How many Attendees were there for each Employer?
#Insights: Goldman Sachs had the largest attendees by far, with Google, RTX, and Ancestry following it.

#Keeping only the first 10 rows of the sorted table
top_n__employers = group_by_employer_attendees_sorted.head(10)
fig, ax = plt.subplots(figsize=(15, 15))
ax.bar(top_n__employers['Employers Name'], top_n__employers['Total_Attendees'], color='green')
ax.set(title='Total Attendees Per Employer', xlabel='Employers Name', ylabel='Total Attendees')
#Rotating the employer names by 90 degrees
ax.tick_params(axis='x', labelrotation=90)
plt.show()

In [None]:
#Question 3 Code: How many Events did each Employer host?

#Counting the non-null Events Date values for each Employers Name
group_by_employers_total = df2.groupby(['Employers Name']).agg(Total_Times_Attended = ('Events Date', 'count')).reset_index()
#Ordering from the highest count to the lowest and renumbering the rows from 0
group_by_employers_total_sorted = group_by_employers_total.sort_values(by='Total_Times_Attended', ascending=False).reset_index(drop=True)
#Limiting the display to 500 rows; the full option name is used because
#'display.max.rows' only worked through pandas' regular-expression partial matching
pd.set_option('display.max_rows', 500)
group_by_employers_total_sorted

In [None]:
#Question 3 Bar Graph: How many Events did each Employer host?
#Insights: Morgan Stanley, EY, Mayo Clinic, and Goldman Sachs hosted the most Events.

#Keeping only the first 10 rows of the sorted table
top__n__employers = group_by_employers_total_sorted.head(10)
fig, ax = plt.subplots(figsize=(15, 15))
ax.bar(top__n__employers['Employers Name'], top__n__employers['Total_Times_Attended'], color='green')
ax.set(title='Total Times Each Employer Attended', xlabel='Employers Name', ylabel='Total Times')
#Rotating the employer names by 90 degrees
ax.tick_params(axis='x', labelrotation=90)
plt.show()

In [None]:
#Question 4 Code: How many Attendees were there for each Event Type?

#Summing Attendees Count for each Event Type
group_by_attendees_type = (
    df2.groupby('Event Type')['Attendees Count']
    .sum()
    .reset_index(name='Total_Attendees')
)
#Ordering from the highest total to the lowest and renumbering the rows from 0
group_by_attendees_type_sorted = (
    group_by_attendees_type
    .sort_values('Total_Attendees', ascending=False)
    .reset_index(drop=True)
)
group_by_attendees_type_sorted

In [None]:
#Question 4 Bar Graph: How many Attendees were there for each Event Type?
#Insights: Info Sessions had the most attendees by far.

fig, ax = plt.subplots(figsize=(15, 15))
ax.bar(group_by_attendees_type_sorted['Event Type'], group_by_attendees_type_sorted['Total_Attendees'], color='green')
ax.set(title='Total Attendees Per Event Type', xlabel='Event Type', ylabel='Total Attendees')
#Rotating the event type names by 90 degrees
ax.tick_params(axis='x', labelrotation=90)
plt.show()

In [None]:
#Question 5 Code: How many Events did each Career Center host?

group_by_career_center = df2.groupby(['Career Center']).agg(Total_Employers = ('Employers Name', 'count')).reset_index()
group_by_career_center_sorted = group_by_career_center.sort_values(by='Total_Employers', ascending=False).reset_index(drop=True)
group_by_career_center_sorted

In [None]:
#Question 5 Bar Graph: How many Events did each Career Center host?
#Insights: Most of the Employers did not have an associated Career Center, so this analysis proves useless.

plt.figure(figsize=(15, 15))
plt.bar(group_by_career_center_sorted['Career Center'], group_by_career_center_sorted['Total_Employers'], color='green')
plt.title('Total Employer Attendees Per Career Center')
plt.xlabel('Career Center')
plt.ylabel('Total Employer Attendees')
plt.xticks(rotation=90)
plt.show()

In [None]:
#Question 6 Code: How many Attendees attended each Event?

#Summing Attendees Count for each Events Name (the result column is named Total_Employers)
group_by_events_name = (
    df2.groupby('Events Name')['Attendees Count']
    .sum()
    .reset_index(name='Total_Employers')
)
#Ordering from the highest total to the lowest and renumbering the rows from 0
group_by_events_name_sorted = (
    group_by_events_name
    .sort_values('Total_Employers', ascending=False)
    .reset_index(drop=True)
)
group_by_events_name_sorted

In [None]:
#Question 6 Bar Graph: How many Attendees attended each Event?
#Insights: Retail Fuse had the most attendees by far.

#Keeping only the first 10 rows of the sorted table
top____n____employers = group_by_events_name_sorted.head(10)
fig, ax = plt.subplots(figsize=(15, 15))
ax.bar(top____n____employers['Events Name'], top____n____employers['Total_Employers'], color='green')
ax.set(title='Total Employer Attendees Per Event', xlabel='Event Name', ylabel='Total Employer Attendees')
#Rotating the event names by 90 degrees
ax.tick_params(axis='x', labelrotation=90)
plt.show()

In [None]:
#Question 7 Line Graph: When did these Events take place?
#Insights: During Quarters 3 & 4 of 2022 and 2023, the number of Events spiked up significantly.

#Adding a Quarter column holding the quarterly period of each Events Date
df2['Quarter'] = df2['Events Date'].dt.to_period(freq='Q')
#Number of rows (events) in each quarter
quarterly_data = df2.groupby(by='Quarter').size()
fig, ax = plt.subplots(figsize=(10, 6))
quarterly_data.plot(ax=ax, kind='line', marker='o', color='green')
ax.set_title('Number of Events by Quarter')
ax.set_xlabel('Quarter')
ax.set_ylabel('Number of Events')
ax.grid(False)
ax.tick_params(axis='x', labelrotation=45)
fig.tight_layout()
plt.show()

# Career Fair Data Analysis


In [None]:
#Reading in the Career Fair csv file (ISO-8859-1 encoded)
#NOTE(review): this is an absolute path on one machine - confirm it before running elsewhere
df3 = pd.read_csv(r"C:\Users\shiha\Downloads\Projects\Work Projects\ITJob_projects\ExcelHandshake\HS_Scorecard_CareerFairs.csv",
                encoding = 'ISO-8859-1')
#Raising the display limits to 2500 rows and 10 columns.
#The full option names are used: pandas treats the option string as a regular expression,
#so 'display.max.rows' only worked through a partial match that may break in future versions.
pd.set_option('display.max_rows', 2500)
pd.set_option('display.max_columns', 10)
df3

In [None]:
#Getting a general overview of the Career Fair data
#(df3.info() prints the columns with their non-null counts and data types)
df3.info()

In [None]:
#Dropping the Null row by its missing Career Fair ID instead of a hard-coded index label,
#so the clean-up keeps working when the export gains or loses rows.
#NOTE(review): this replaces df3.drop(2320); it assumes the Null row at index 2320 is the
#only row without a Career Fair ID - confirm against the df3.info() output.
df3 = df3.dropna(subset=['Career Fair ID'])

In [None]:
#Casting the Career Fair ID column to int (it was read in as a Float)
df3 = df3.astype({'Career Fair ID': int})

In [None]:
#Casting the Employers Count column to int (it was read in as a Float)
df3 = df3.astype({'Employers Count': int})

In [None]:
#Parsing the Career Fair Dates and Times Start Date column (read in as a String) into datetime values
df3 = df3.assign(**{
    'Career Fair Dates and Times Start Date': pd.to_datetime(df3['Career Fair Dates and Times Start Date']),
})

In [None]:
#Changing 2 column names to more concise names (1 of 2):
#Career Fair Dates and Times Start Date -> Career Fairs Date
df3 = df3.rename(columns={'Career Fair Dates and Times Start Date': 'Career Fairs Date'})

In [None]:
#Shortening the column name Career Center Name to Career Center
df3 = df3.rename(columns={'Career Center Name': 'Career Center'})

In [None]:
#Question 1 Code: How many times did each Employer attend a Career Fair?

#Summing Employers Count for each Employers Name
group_by_employer_name = (
    df3.groupby('Employers Name')['Employers Count']
    .sum()
    .reset_index(name='num_of_employers')
)
#Ordering from the highest total to the lowest and renumbering the rows from 0
group_by_employer_name_sorted = (
    group_by_employer_name
    .sort_values('num_of_employers', ascending=False)
    .reset_index(drop=True)
)
group_by_employer_name_sorted

In [None]:
#Question 1 Bar Graph: How many times did each Employer attend a Career Fair?
#Insights: BASIS attended the most career fairs, without any other notable trends.
#(NumPy is imported as np with the other imports at the top of the notebook.)

#Keeping only the first 10 rows of the sorted table
top______n______employers = group_by_employer_name_sorted.head(10)
plt.figure(figsize=(15, 5))
plt.bar(top______n______employers['Employers Name'], top______n______employers['num_of_employers'], color='blue')
plt.title('Career Fairs Per Employer')
plt.xlabel('Employer Name')
plt.ylabel('Number of Career Fairs')
#Rotating the employer names by 90 degrees
plt.xticks(rotation=90)
#Fixed y-axis ticks at every even number from 0 to 20
plt.yticks(np.arange(0, 22, step=2))
plt.show()

In [None]:
#Question 2 Code: How many Employers attended each Career Fair?

#Counting the rows recorded under each Career Fairs Name
group_by_career_fair_name = (
    df3.groupby('Career Fairs Name')
    .size()
    .reset_index(name='num_of_employers')
)
#Ordering from the highest count to the lowest and renumbering the rows from 0
group_by_career_fair_name_sorted = (
    group_by_career_fair_name
    .sort_values('num_of_employers', ascending=False)
    .reset_index(drop=True)
)
group_by_career_fair_name_sorted

In [None]:
#Question 2 Bar Graph: How many Employers attended each Career Fair?
#Insights: Fall 2022 Eller Expo had the most employer attendees, with Fall 2023 and 2021 Eller Expos following.

#Every career fair in the sorted table is plotted (no top-N cut-off is applied here)
plt.figure(figsize=(15, 5))
plt.bar(group_by_career_fair_name_sorted['Career Fairs Name'], group_by_career_fair_name_sorted['num_of_employers'], color='blue')
plt.title('Employer Attendees Per Career Fair')
plt.xlabel('Career Fair')
plt.ylabel('Number Of Employers')
#Rotating the career fair names by 90 degrees
plt.xticks(rotation=90)
plt.show()

In [None]:
#Question 3 Line Graph: When did these Career Fairs take place?
#Insights: During Quarters 1 & 3 of 2022 and 2023, the number of Career Fairs spiked up significantly.

#Adding a Quarter column holding the quarterly period of each Career Fairs Date
df3['Quarter'] = df3['Career Fairs Date'].dt.to_period(freq='Q')
#Number of rows in each quarter
quarterly_data = df3.groupby(by='Quarter').size()
fig, ax = plt.subplots(figsize=(10, 6))
quarterly_data.plot(ax=ax, kind='line', marker='o', color='blue')
ax.set_title('Number of Career Fairs by Quarter')
ax.set_xlabel('Quarter')
ax.set_ylabel('Number of Career Fairs')
ax.grid(False)
ax.tick_params(axis='x', labelrotation=45)
fig.tight_layout()
plt.show()