In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import preprocessing, svm
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

In [2]:
data=pd.read_csv("nba.csv")

In [3]:
data.head()

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7730337.0
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6796117.0
2,John Holland,Boston Celtics,30.0,SG,27.0,6-5,205.0,Boston University,
3,R.J. Hunter,Boston Celtics,28.0,SG,22.0,6-5,185.0,Georgia State,1148640.0
4,Jonas Jerebko,Boston Celtics,8.0,PF,29.0,6-10,231.0,,5000000.0


In [4]:
data.shape

(458, 9)

In [5]:
data.dtypes

Name         object
Team         object
Number      float64
Position     object
Age         float64
Height       object
Weight      float64
College      object
Salary      float64
dtype: object

In [6]:
data.describe()

Unnamed: 0,Number,Age,Weight,Salary
count,457.0,457.0,457.0,446.0
mean,17.678337,26.938731,221.522976,4842684.0
std,15.96609,4.404016,26.368343,5229238.0
min,0.0,19.0,161.0,30888.0
25%,5.0,24.0,200.0,1044792.0
50%,13.0,26.0,220.0,2839073.0
75%,25.0,30.0,240.0,6500000.0
max,99.0,40.0,307.0,25000000.0


In [7]:
data.isnull().sum()

Name         1
Team         1
Number       1
Position     1
Age          1
Height       1
Weight       1
College     85
Salary      12
dtype: int64

In [8]:
# filling null VALUES WITH MODE
for i in range(len(data.columns)):
    currcol=data.columns[i]
    mode=data[currcol].mode()[0]
    data[currcol].fillna(mode,inplace=True)

In [9]:
data["Height"].value_counts()

Height
6-9     60
6-10    47
6-7     45
6-8     43
6-6     42
6-11    40
6-3     33
6-5     32
6-4     29
7-0     27
6-1     16
6-2     16
6-0     10
7-1      7
7-3      4
5-11     3
7-2      3
5-9      1
Name: count, dtype: int64

In [10]:
heightgroup=data.groupby(data["Height"])

In [11]:
heightgroup.get_group("6-0")

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
47,Isaiah Canaan,Philadelphia 76ers,0.0,PG,25.0,6-0,201.0,Murray State,947276.0
57,Ish Smith,Philadelphia 76ers,1.0,PG,27.0,6-0,175.0,Wake Forest,947276.0
67,Kyle Lowry,Toronto Raptors,7.0,PG,30.0,6-0,205.0,Villanova,12000000.0
100,Chris Paul,Los Angeles Clippers,3.0,PG,31.0,6-0,175.0,Wake Forest,21468695.0
142,Darren Collison,Sacramento Kings,7.0,PG,28.0,6-0,175.0,UCLA,5013559.0
152,Aaron Brooks,Chicago Bulls,0.0,PG,31.0,6-0,161.0,Oregon,2250000.0
228,J.J. Barea,Dallas Mavericks,5.0,PG,31.0,6-0,185.0,Northeastern,4290000.0
305,Patty Mills,San Antonio Spurs,8.0,PG,27.0,6-0,185.0,Saint Mary's,3578947.0
384,D.J. Augustin,Denver Nuggets,12.0,PG,28.0,6-0,183.0,Texas,3000000.0
394,Jameer Nelson,Denver Nuggets,1.0,PG,34.0,6-0,190.0,Saint Joseph's,4345000.0


In [12]:
heightgroup.get_group("6-0")["Salary"].mean()

5784075.3

In [13]:
data["Age"].value_counts()

Age
24.0    48
25.0    45
27.0    41
23.0    41
26.0    36
28.0    31
30.0    31
29.0    28
22.0    26
31.0    22
20.0    19
21.0    19
33.0    14
32.0    13
34.0    10
36.0    10
35.0     9
37.0     4
38.0     4
40.0     3
39.0     2
19.0     2
Name: count, dtype: int64

In [14]:
bins=[15,20,25,30,35,40,45]
labels=["15-19","20-24","25-29","30-34","35-39","40-45"]
data["AgeGroup"]=pd.cut(data["Age"],bins=bins,labels=labels,right=False)

In [15]:
data

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary,AgeGroup
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7730337.0,25-29
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6796117.0,25-29
2,John Holland,Boston Celtics,30.0,SG,27.0,6-5,205.0,Boston University,947276.0,25-29
3,R.J. Hunter,Boston Celtics,28.0,SG,22.0,6-5,185.0,Georgia State,1148640.0,20-24
4,Jonas Jerebko,Boston Celtics,8.0,PF,29.0,6-10,231.0,Kentucky,5000000.0,25-29
...,...,...,...,...,...,...,...,...,...,...
453,Shelvin Mack,Utah Jazz,8.0,PG,26.0,6-3,203.0,Butler,2433333.0,25-29
454,Raul Neto,Utah Jazz,25.0,PG,24.0,6-1,179.0,Kentucky,900000.0,20-24
455,Tibor Pleiss,Utah Jazz,21.0,C,26.0,7-3,256.0,Kentucky,2900000.0,25-29
456,Jeff Withey,Utah Jazz,24.0,C,26.0,7-0,231.0,Kansas,947276.0,25-29


In [16]:
data["AgeGroup"].value_counts()

AgeGroup
25-29    181
20-24    153
30-34     90
35-39     29
40-45      3
15-19      2
Name: count, dtype: int64

In [17]:
agegroup=data.groupby(data["AgeGroup"])

  agegroup=data.groupby(data["AgeGroup"])


In [18]:
agegroup.get_group("15-19")

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary,AgeGroup
122,Devin Booker,Phoenix Suns,1.0,SG,19.0,6-6,206.0,Kentucky,2127840.0,15-19
226,Rashad Vaughn,Milwaukee Bucks,20.0,SG,19.0,6-6,202.0,UNLV,1733040.0,15-19


In [19]:
agegroup.get_group("15-19")["Salary"]

122    2127840.0
226    1733040.0
Name: Salary, dtype: float64

In [20]:
agegroup.get_group("15-19")["Salary"].mean()

1930440.0

In [21]:
agegroup.get_group("25-29")["Salary"].median()

3425510.0

In [22]:
list_of_salaries = list(agegroup['Salary'])
list_of_salaries

[('15-19',
  122    2127840.0
  226    1733040.0
  Name: Salary, dtype: float64),
 ('20-24',
  3      1148640.0
  6      1170960.0
  8      1824360.0
  9      3431040.0
  10     2569260.0
           ...    
  447    1175880.0
  449    1348440.0
  452    2239800.0
  454     900000.0
  457     947276.0
  Name: Salary, Length: 153, dtype: float64),
 ('25-29',
  0       7730337.0
  1       6796117.0
  2        947276.0
  4       5000000.0
  5      12000000.0
            ...    
  450     2050000.0
  451      981348.0
  453     2433333.0
  455     2900000.0
  456      947276.0
  Name: Salary, Length: 181, dtype: float64),
 ('30-34',
  19      6300000.0
  30      8000000.0
  31      1635476.0
  33     22875000.0
  34      7402812.0
            ...    
  405    12100000.0
  415     3135000.0
  421     3344000.0
  434     5016000.0
  440     2854940.0
  Name: Salary, Length: 90, dtype: float64),
 ('35-39',
  46       947276.0
  72      2900000.0
  93      5675000.0
  101     3376000.0
  102   


Assignment 3 Part 2


In [23]:
dataset=pd.read_csv("Iris.csv")

In [24]:
dataset.describe

<bound method NDFrame.describe of       Id  SepalLengthCm  SepalWidthCm  PetalLengthCm  PetalWidthCm  \
0      1            5.1           3.5            1.4           0.2   
1      2            4.9           3.0            1.4           0.2   
2      3            4.7           3.2            1.3           0.2   
3      4            4.6           3.1            1.5           0.2   
4      5            5.0           3.6            1.4           0.2   
..   ...            ...           ...            ...           ...   
145  146            6.7           3.0            5.2           2.3   
146  147            6.3           2.5            5.0           1.9   
147  148            6.5           3.0            5.2           2.0   
148  149            6.2           3.4            5.4           2.3   
149  150            5.9           3.0            5.1           1.8   

            Species  
0       Iris-setosa  
1       Iris-setosa  
2       Iris-setosa  
3       Iris-setosa  
4       Iris-se

In [25]:
dataset.shape

(150, 6)

In [26]:
dataset.dtypes

Id                 int64
SepalLengthCm    float64
SepalWidthCm     float64
PetalLengthCm    float64
PetalWidthCm     float64
Species           object
dtype: object

In [27]:
dataset.head()

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,Iris-setosa
1,2,4.9,3.0,1.4,0.2,Iris-setosa
2,3,4.7,3.2,1.3,0.2,Iris-setosa
3,4,4.6,3.1,1.5,0.2,Iris-setosa
4,5,5.0,3.6,1.4,0.2,Iris-setosa


In [28]:
dataset

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,Iris-setosa
1,2,4.9,3.0,1.4,0.2,Iris-setosa
2,3,4.7,3.2,1.3,0.2,Iris-setosa
3,4,4.6,3.1,1.5,0.2,Iris-setosa
4,5,5.0,3.6,1.4,0.2,Iris-setosa
...,...,...,...,...,...,...
145,146,6.7,3.0,5.2,2.3,Iris-virginica
146,147,6.3,2.5,5.0,1.9,Iris-virginica
147,148,6.5,3.0,5.2,2.0,Iris-virginica
148,149,6.2,3.4,5.4,2.3,Iris-virginica


In [29]:


def basic_statistics(df):
    print("\nIris-setosa")
    print(df[df.Species=='Iris-setosa'].describe().transpose())

    print("\nIris-versicolor")
    print(df[df.Species=='Iris-versicolor'].describe().transpose())

    print("\nIris-virginica")
    print(df[df.Species=='Iris-virginica'].describe().transpose())
    
basic_statistics(dataset)    




Iris-setosa
               count    mean        std  min     25%   50%     75%   max
Id              50.0  25.500  14.577380  1.0  13.250  25.5  37.750  50.0
SepalLengthCm   50.0   5.006   0.352490  4.3   4.800   5.0   5.200   5.8
SepalWidthCm    50.0   3.418   0.381024  2.3   3.125   3.4   3.675   4.4
PetalLengthCm   50.0   1.464   0.173511  1.0   1.400   1.5   1.575   1.9
PetalWidthCm    50.0   0.244   0.107210  0.1   0.200   0.2   0.300   0.6

Iris-versicolor
               count    mean        std   min     25%    50%    75%    max
Id              50.0  75.500  14.577380  51.0  63.250  75.50  87.75  100.0
SepalLengthCm   50.0   5.936   0.516171   4.9   5.600   5.90   6.30    7.0
SepalWidthCm    50.0   2.770   0.313798   2.0   2.525   2.80   3.00    3.4
PetalLengthCm   50.0   4.260   0.469911   3.0   4.000   4.35   4.60    5.1
PetalWidthCm    50.0   1.326   0.197753   1.0   1.200   1.30   1.50    1.8

Iris-virginica
               count     mean        std    min      25%     50%  