Amanda Rodgers
May 29th, 2024
Pandas Basic Practice

In [7]:
# Import packages
import pandas as pd
import numpy as np
import statsmodels.api as sm
from scipy import stats
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

In [8]:
# to get help and see documentation for pandas
# help()  Ex. help(df.isnull) to see how the isnull function works on a df
# print(df._doc_) ( ?? Need to figure out how to do this???)
# dir(df)  - shows functions you can use on df, a directory list of df functions or methods
# to see what a function does, click on it and hit shift tab and it will bring up documentation for that function
# type(object) - shows you the object type you are working with 

In [9]:
# Upload a csv dataset
df = pd.read_csv("/Users/amandarodgers/Documents/Data_Bootcamp/Python/data_sets/StudentScores.csv")
df

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75
...,...,...,...,...,...,...,...,...
995,female,group E,master's degree,standard,completed,88,99,95
996,male,group C,high school,free/reduced,none,62,55,55
997,female,group C,high school,free/reduced,completed,59,71,65
998,female,group D,some college,standard,completed,68,78,77


In [10]:
# Check column data types and number of columns and rows
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   gender                       1000 non-null   object
 1   race/ethnicity               1000 non-null   object
 2   parental level of education  1000 non-null   object
 3   lunch                        1000 non-null   object
 4   test preparation course      1000 non-null   object
 5   math score                   1000 non-null   int64 
 6   reading score                1000 non-null   int64 
 7   writing score                1000 non-null   int64 
dtypes: int64(3), object(5)
memory usage: 62.6+ KB


In [11]:
# Display rows with any NaN values ( can also look at .info and compare non-null to h.m. entries)
rows_with_nan = df[df.isnull().any(axis=1)]

print("Rows with any NaN values:")
print(rows_with_nan)

Rows with any NaN values:
Empty DataFrame
Columns: [gender, race/ethnicity, parental level of education, lunch, test preparation course, math score, reading score, writing score]
Index: []


In [12]:
# Check for NaN values in each column
nan_in_columns = df.isnull().any()

print("NaN values in each column:")
print(nan_in_columns)

NaN values in each column:
gender                         False
race/ethnicity                 False
parental level of education    False
lunch                          False
test preparation course        False
math score                     False
reading score                  False
writing score                  False
dtype: bool


In [13]:
# To list the stats for the number columns in df
df.describe()

Unnamed: 0,math score,reading score,writing score
count,1000.0,1000.0,1000.0
mean,66.089,69.169,68.054
std,15.16308,14.600192,15.195657
min,0.0,17.0,10.0
25%,57.0,59.0,57.75
50%,66.0,70.0,69.0
75%,77.0,79.0,79.0
max,100.0,100.0,100.0


In [14]:
# To describe all of the columns including strings do below: 
# Will give you count and h.m. levels in categories, for example in gender column it says unique 2 (male, female)
print(df.describe(include='all'))

        gender race/ethnicity parental level of education     lunch  \
count     1000           1000                        1000      1000   
unique       2              5                           6         2   
top     female        group C                some college  standard   
freq       518            319                         226       645   
mean       NaN            NaN                         NaN       NaN   
std        NaN            NaN                         NaN       NaN   
min        NaN            NaN                         NaN       NaN   
25%        NaN            NaN                         NaN       NaN   
50%        NaN            NaN                         NaN       NaN   
75%        NaN            NaN                         NaN       NaN   
max        NaN            NaN                         NaN       NaN   

       test preparation course  math score  reading score  writing score  
count                     1000  1000.00000    1000.000000    1000.000000

In [15]:
# Change a column's data type ( this changes the math scores column to strings)
df['math score'] = df['math score'].astype(str)
print("\nDataFrame after changing data types:")
print(df)
print(df.dtypes)


DataFrame after changing data types:
     gender race/ethnicity parental level of education         lunch  \
0    female        group B           bachelor's degree      standard   
1    female        group C                some college      standard   
2    female        group B             master's degree      standard   
3      male        group A          associate's degree  free/reduced   
4      male        group C                some college      standard   
..      ...            ...                         ...           ...   
995  female        group E             master's degree      standard   
996    male        group C                 high school  free/reduced   
997  female        group C                 high school  free/reduced   
998  female        group D                some college      standard   
999  female        group D                some college  free/reduced   

    test preparation course math score  reading score  writing score  
0                      non

In [16]:
# Change math score data type back to an integer
df['math score'] = df['math score'].astype(int)
print("\nDataFrame after changing data types:")
print(df.dtypes)


DataFrame after changing data types:
gender                         object
race/ethnicity                 object
parental level of education    object
lunch                          object
test preparation course        object
math score                      int64
reading score                   int64
writing score                   int64
dtype: object


In [17]:
# Create a subset df of just numerical columns naming the columns
numerical_columns_df = df[['math score', 'writing score', 'reading score' ]]
numerical_columns_df

Unnamed: 0,math score,writing score,reading score
0,72,74,72
1,69,88,90
2,90,93,95
3,47,44,57
4,76,75,78
...,...,...,...
995,88,95,99
996,62,55,55
997,59,65,71
998,68,77,78


In [18]:
# Create a dataframe using iloc, select columns by index starting with 0
race_ethnicity_scores_df = df.iloc[:, [1, 5, 6, 7]]
race_ethnicity_scores_df

Unnamed: 0,race/ethnicity,math score,reading score,writing score
0,group B,72,72,74
1,group C,69,90,88
2,group B,90,95,93
3,group A,47,57,44
4,group C,76,78,75
...,...,...,...,...
995,group E,88,99,95
996,group C,62,55,55
997,group C,59,71,65
998,group D,68,78,77


In [22]:
# Create a df using a variable
columns_to_select = ['parental level of education', 'math score', 'writing score', 'reading score']
parent_education_df = df[columns_to_select]
parent_education_df

Unnamed: 0,parental level of education,math score,writing score,reading score
0,bachelor's degree,72,74,72
1,some college,69,88,90
2,master's degree,90,93,95
3,associate's degree,47,44,57
4,some college,76,75,78
...,...,...,...,...
995,master's degree,88,95,99
996,high school,62,55,55
997,high school,59,65,71
998,some college,68,77,78


In [24]:
# Change the name of a column using rename method
parent_education_df.rename(columns={'parental level of education': 'parental_level_of_education'}, inplace=True)
parent_education_df

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  parent_education_df.rename(columns={'parental level of education': 'parental_level_of_education'}, inplace=True)


Unnamed: 0,parental_level_of_education,math score,writing score,reading score
0,bachelor's degree,72,74,72
1,some college,69,88,90
2,master's degree,90,93,95
3,associate's degree,47,44,57
4,some college,76,75,78
...,...,...,...,...
995,master's degree,88,95,99
996,high school,62,55,55
997,high school,59,65,71
998,some college,68,77,78


In [25]:
# Change name of columns using columns attribute, just type what the new names will be
parent_education_df.columns = ['parental_level_of_education', 'math_score', 'writing score', 'reading score']  # Renaming columns directly
parent_education_df

Unnamed: 0,parental_level_of_education,math_score,writing score,reading score
0,bachelor's degree,72,74,72
1,some college,69,88,90
2,master's degree,90,93,95
3,associate's degree,47,44,57
4,some college,76,75,78
...,...,...,...,...
995,master's degree,88,95,99
996,high school,62,55,55
997,high school,59,65,71
998,some college,68,77,78
