In [59]:
import pandas as pd

# Load the salary dataset from CSV into a DataFrame
df = pd.read_csv('salaries_by_college_major.csv')

# Preview the leading rows (at most 60)
first_rows = df.head(60)
print(first_rows)

# Report the dataset dimensions as (rows, columns)
print(df.shape)

# List the column labels
print(df.columns)

# Count the missing (NaN) entries in each column
nan_mask = df.isna()
print(nan_mask.sum())

# Inspect the trailing rows
print(df.tail())

# Keep only the rows that contain no NaN, then preview the cleaned frame
data = df.dropna()
cleaned_preview = data.head(60)
print(cleaned_preview)

# Highest and lowest paid undergraduate majors by starting median salary
starting_career = data["Starting Median Salary"]

# Row labels at which the extreme starting salaries occur
index_for_max_starter = starting_career.idxmax()
index_for_min_starter = starting_career.idxmin()

# The extreme salary values themselves
max_salary_for_starter = starting_career.max()
min_salary_for_starter = starting_career.min()

# Look up the major names by row label and column name in one step
highestPaid_starter_major = data.loc[index_for_max_starter, "Undergraduate Major"]
lowestPaid_starter_major = data.loc[index_for_min_starter, "Undergraduate Major"]

print(f"The highest paid undergraduate major is: {highestPaid_starter_major} for {max_salary_for_starter} USD")
print(f"The lowest paid undergraduate major is: {lowestPaid_starter_major} for {min_salary_for_starter} USD")

# Highest and lowest paid majors by mid-career median salary
mid_career = data["Mid-Career Median Salary"]

# Row labels at which the extreme mid-career salaries occur
index_for_midcareer = mid_career.idxmax()
index_min_for_midcareer = mid_career.idxmin()

# The extreme salary values themselves
max_midcareer_salary = mid_career.max()
min_midcareer_salary = mid_career.min()

# Look up the major names by row label and column name in one step
highest_paid_midcareer_job = data.loc[index_for_midcareer, "Undergraduate Major"]
lowest_paid_midcareer_job = data.loc[index_min_for_midcareer, "Undergraduate Major"]

print(f"The highest paid undergraduate major for mid-career is: {highest_paid_midcareer_job} for {max_midcareer_salary} USD")
print(f"The lowest paid undergraduate major for mid-career is: {lowest_paid_midcareer_job} for {min_midcareer_salary} USD")

# Print the full rows for the lowest starting salary and for the highest
# and lowest mid-career salaries, in that order
for row_label in (index_for_min_starter, index_for_midcareer, index_min_for_midcareer):
    print(data.loc[row_label])

# Lowest-risk majors: measure the gap between the 90th and 10th percentile
# mid-career salaries and store it as a new "Spread" column at position 5
safety_majors = data["Mid-Career 90th Percentile Salary"].sub(
    data["Mid-Career 10th Percentile Salary"]
)
data.insert(loc=5, column="Spread", value=safety_majors)
print(data.head())

# Order the majors from the smallest spread upwards and show the top rows
low_risk = data.sort_values(by="Spread")
low_risk_view = low_risk[['Undergraduate Major', 'Spread']]
print(low_risk_view.head())

# Rank degrees by the 90th percentile mid-career salary, highest first
high_potential = data.sort_values(by="Mid-Career 90th Percentile Salary", ascending=False)
high_potential_view = high_potential[["Undergraduate Major", "Mid-Career 90th Percentile Salary"]]
print(high_potential_view.head(5))

# Rank degrees by spread, widest first
spread = data.sort_values(by="Spread", ascending=False)
spread_view = spread[["Undergraduate Major", "Spread"]]
print(spread_view.head(5))

# Safest major: the one with the smallest gap between the 90th and 10th
# percentile mid-career salaries. `low_risk` is sorted by "Spread" in
# ascending order, so its first entry is the lowest-spread major. The
# descending-sorted `spread` frame would put the widest-spread (riskiest)
# major first, so it must not be used here.
list_major = low_risk["Undergraduate Major"].to_list()
print(f"The safest major is: {list_major[0]}")

# Show floats with thousands separators and two decimals from here on
pd.set_option("display.float_format", "{:,.2f}".format)

# Average every numeric column within each value of the 'Group' column;
# non-numeric columns are left out before aggregating
numeric_data = data.select_dtypes(include="number")
group_mean = numeric_data.groupby(by=data["Group"]).mean()
print(group_mean)

                     Undergraduate Major  Starting Median Salary  \
0                             Accounting                 46000.0   
1                  Aerospace Engineering                 57700.0   
2                            Agriculture                 42600.0   
3                           Anthropology                 36800.0   
4                           Architecture                 41600.0   
5                            Art History                 35800.0   
6                                Biology                 38800.0   
7                    Business Management                 43000.0   
8                   Chemical Engineering                 63200.0   
9                              Chemistry                 42600.0   
10                     Civil Engineering                 53900.0   
11                        Communications                 38100.0   
12                  Computer Engineering                 61400.0   
13                      Computer Science        