#  Petite Pandas Data Analysis Main Hack
- toc: true

In [55]:
import numpy as np
import pandas as pd

# Extra: Dataset generator
> Uses numpy to make a dataset

In [56]:
# Define a list of common names for student generation
names = ["Fred", "Willam", "Joe", "Robert", "Colin", "Ethan", "James", "Connor", "Thomas", "Hunter", "Jaden", "Leonard", "Fredrick", "Billy"]

def generateStudent(id):
    student = [id]
    
    # Name
    name = names[np.random.randint(0, len(names)-1)] + " " # First name
    name += names[np.random.randint(0, len(names)-1)][0] + ". " # Middle initial
    name += names[np.random.randint(0, len(names)-1)] # Last name
    student.append(name)

    # Hours spent on hw (per week)
    hoursOnHw = np.random.random() * 10 + 0.1 # Generates float from 0.1 to 10.1
    hoursOnHw = np.log2(hoursOnHw) + 1 # Simulates a plateu of hours on hw 
    student.append(hoursOnHw)

    # Grade (GPA format)
    skill = np.random.random() * 1.5 + 0.5 # Generates a random skill multiplier from 0.5 to 2
    score = round(hoursOnHw * skill)
    if(score>5):
        score = 5
    elif(score<1):
        score = 1
    student.append(score)

    return student

# Create students
students = []
for i in range(100):
    students.append(generateStudent(i))

# Print in csv format
print("id,name,hoursOnHw,score")
for student in students:
    print(student[0], end=",")
    print(student[1], end=",")
    print(student[2], end=",")
    print(student[3])
    

id,name,hoursOnHw,score
0,James L. James,3.237205499127559,2
1,Thomas F. Willam,1.8866150062581544,3
2,Robert L. Willam,3.9336736423431753,5
3,Fredrick H. Hunter,3.089383805772319,2
4,Thomas C. Willam,1.118440208260399,1
5,Hunter L. Leonard,1.413370650835248,2
6,Robert C. Hunter,3.4846642778582115,5
7,Robert J. Connor,2.2668694379195915,2
8,Leonard H. Thomas,2.136136212410632,1
9,Thomas C. Colin,2.2035175947710277,3
10,Thomas R. Colin,3.267364205364564,4
11,Fred C. Thomas,3.976922262810806,4
12,Jaden J. Leonard,0.008254586224481053,1
13,Jaden W. Joe,3.877247058500784,5
14,Leonard F. Hunter,3.2444904915989166,5
15,Fred R. Connor,2.3820990139856155,4
16,Colin R. Fred,3.122709988019411,3
17,Hunter E. Hunter,3.7295739088197752,5
18,Jaden C. Jaden,3.6487428868550107,4
19,Joe C. Thomas,0.5141759632476433,1
20,Ethan J. Colin,1.195405891647107,1
21,Connor J. Robert,3.554175950701118,5
22,Connor J. Fredrick,4.044149690758513,5
23,Leonard E. Fredrick,2.7704028263655465,2
24,Fred W. Fred,2.430296

# Pandas
> Analyze data

In [57]:
students = pd.DataFrame(np.genfromtxt('files/students.csv', delimiter=',', dtype=str, encoding='utf-8'))

print(students)

      0                   1                     2      3
0    id                name             hoursOnHw  score
1     0   Leonard W. Hunter     4.249093837849809      3
2     1  Fredrick R. Hunter    1.4185357714076718      2
3     2    Willam T. Willam    -0.952639561629149      1
4     3    Leonard J. Ethan   0.45393251649677235      1
..   ..                 ...                   ...    ...
96   95     Fred J. Leonard     3.415124645123977      4
97   96       Joe H. Thomas    2.6327753622042156      1
98   97    James F. Leonard    3.0453511539690554      2
99   98    Thomas L. Thomas  0.031164846240109734      1
100  99  Willam J. Fredrick    2.5751866038250726      2

[101 rows x 4 columns]


## Analyze data
> Find max, min, and median

In [58]:
import pandas as pd

df = pd.read_csv('files/students.csv')

hoursOnHw_max = df['hoursOnHw'].max()
hoursOnHw_min = df['hoursOnHw'].min()
hoursOnHw_mean = df['hoursOnHw'].mean()
hoursOnHw_median = df['hoursOnHw'].median()

score_max = df['score'].max()
score_min = df['score'].min()
score_mean = df['score'].mean()
score_median = df['score'].median()

print(f'Max hoursOnHw: {hoursOnHw_max}')
print(f'Min hoursOnHw: {hoursOnHw_min}')
print(f'Average hoursOnHw: {hoursOnHw_mean}')
print(f'Median hoursOnHw: {hoursOnHw_median}')
print(f'Max score: {score_max}')
print(f'Min score: {score_min}')
print(f'Average score: {score_mean}')
print(f'Median score: {score_median}')

Max hoursOnHw: 4.322522513071912
Min hoursOnHw: -1.2196529681089996
Average hoursOnHw: 2.9180861885936635
Median hoursOnHw: 3.2371786874994823
Max score: 5
Min score: 1
Average score: 3.29
Median score: 3.0


## Sort data

In [65]:
df = df.sort_values(by=['hoursOnHw'])
print("Sorted by time on hw:")
print(df)


Sorted by time on hw:
    id                name  hoursOnHw  score
70  70     Jaden W. Robert  -1.219653      1
23  23    Colin T. Leonard  -1.068941      1
2    2    Willam T. Willam  -0.952640      1
98  98    Thomas L. Thomas   0.031165      1
10  10      James H. James   0.039333      1
..  ..                 ...        ...    ...
12  12    Fredrick E. Fred   4.275583      5
64  64  Connor E. Fredrick   4.276815      5
63  63   Fredrick J. James   4.285514      5
19  19    Hunter J. Hunter   4.301619      5
60  60   Leonard H. Hunter   4.322523      5

[100 rows x 4 columns]


In [67]:
df = df.sort_values(by=['hoursOnHw'])
print("Sorted by scores:")
print(df)

Sorted by scores:
    id                name  hoursOnHw  score
70  70     Jaden W. Robert  -1.219653      1
23  23    Colin T. Leonard  -1.068941      1
2    2    Willam T. Willam  -0.952640      1
98  98    Thomas L. Thomas   0.031165      1
10  10      James H. James   0.039333      1
..  ..                 ...        ...    ...
12  12    Fredrick E. Fred   4.275583      5
64  64  Connor E. Fredrick   4.276815      5
63  63   Fredrick J. James   4.285514      5
19  19    Hunter J. Hunter   4.301619      5
60  60   Leonard H. Hunter   4.322523      5

[100 rows x 4 columns]


## Merge with another dataframe

In [68]:
yeungdf = pd.DataFrame({'id': [1000], 'name': ['Sean Yeung'], 'hoursOnHw': [10], 'score': [5]})
df = pd.concat([df, yeungdf], ignore_index=True)

print(df)

       id                name  hoursOnHw  score
0      70     Jaden W. Robert  -1.219653      1
1      23    Colin T. Leonard  -1.068941      1
2       2    Willam T. Willam  -0.952640      1
3      98    Thomas L. Thomas   0.031165      1
4      10      James H. James   0.039333      1
..    ...                 ...        ...    ...
96     64  Connor E. Fredrick   4.276815      5
97     63   Fredrick J. James   4.285514      5
98     19    Hunter J. Hunter   4.301619      5
99     60   Leonard H. Hunter   4.322523      5
100  1000          Sean Yeung  10.000000      5

[101 rows x 4 columns]
