## Pandas 

### DataFrame and Series 

In [1]:
import pandas as pd

In [2]:
# Column-oriented sample data: each key becomes a column label and each list
# holds that column's values, one entry per row.
data = dict(
    fruite=[1, 3, 4, 5],
    veg=[7, 8, 9, 10],
)

In [3]:
data

{'fruite': [1, 3, 4, 5], 'veg': [7, 8, 9, 10]}

In [4]:
type(data)

dict

In [6]:
df = pd.DataFrame(data)

In [7]:
df['fruite']

0    1
1    3
2    4
3    5
Name: fruite, dtype: int64

In [8]:
type(df['fruite'])

pandas.core.series.Series

In [9]:
df['veg']

0     7
1     8
2     9
3    10
Name: veg, dtype: int64

In [10]:
df.veg

0     7
1     8
2     9
3    10
Name: veg, dtype: int64

### Reading and Writing Files

In [12]:
df = pd.read_csv('nba.csv')

In [17]:
df.head(2)

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7730337.0
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6796117.0


In [18]:
df.tail(2)

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
456,Jeff Withey,Utah Jazz,24.0,C,26.0,7-0,231.0,Kansas,947276.0
457,,,,,,,,,


In [22]:
df1 = df[['Name', 'Team']]

In [24]:
df1.head()

Unnamed: 0,Name,Team
0,Avery Bradley,Boston Celtics
1,Jae Crowder,Boston Celtics
2,John Holland,Boston Celtics
3,R.J. Hunter,Boston Celtics
4,Jonas Jerebko,Boston Celtics


In [26]:
df = pd.read_csv('nba.csv', usecols=['Name', 'Team'])

In [27]:
df.head()

Unnamed: 0,Name,Team
0,Avery Bradley,Boston Celtics
1,Jae Crowder,Boston Celtics
2,John Holland,Boston Celtics
3,R.J. Hunter,Boston Celtics
4,Jonas Jerebko,Boston Celtics


In [28]:
# set_index returns a new frame and the result is not assigned here, so this
# cell only previews 'Name' as the index; df itself is left unchanged.
df.set_index('Name')

Unnamed: 0_level_0,Team
Name,Unnamed: 1_level_1
Avery Bradley,Boston Celtics
Jae Crowder,Boston Celtics
John Holland,Boston Celtics
R.J. Hunter,Boston Celtics
Jonas Jerebko,Boston Celtics
...,...
Shelvin Mack,Utah Jazz
Raul Neto,Utah Jazz
Tibor Pleiss,Utah Jazz
Jeff Withey,Utah Jazz


In [30]:
# Write the Name/Team frame to disk. With the default settings to_csv also
# emits the row index as an unnamed leading column.
df.to_csv('df.csv')

### Info, Shape, Duplicated, and Drop 

In [32]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 458 entries, 0 to 457
Data columns (total 2 columns):
Name    457 non-null object
Team    457 non-null object
dtypes: object(2)
memory usage: 7.3+ KB


In [33]:
df.describe()

Unnamed: 0,Name,Team
count,457,457
unique,457,30
top,Courtney Lee,New Orleans Pelicans
freq,1,19


In [34]:
df.shape

(458, 2)

In [36]:
df.duplicated().sum()

0

In [38]:
df.duplicated(subset=['Team']).sum()

427

In [40]:
# Stack the frame on top of itself so that every row has an exact duplicate,
# which gives the duplicate checks below something to find.
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and
# removed in 2.0; like append, it keeps the original index labels.
df = pd.concat([df, df])

In [42]:
df.duplicated().sum()

458

In [43]:
# Preview the frame with repeated rows removed (the first occurrence of each
# row is kept by default). The result is not assigned, so df still holds the
# doubled data afterwards.
df.drop_duplicates()

Unnamed: 0,Name,Team
0,Avery Bradley,Boston Celtics
1,Jae Crowder,Boston Celtics
2,John Holland,Boston Celtics
3,R.J. Hunter,Boston Celtics
4,Jonas Jerebko,Boston Celtics
...,...,...
453,Shelvin Mack,Utah Jazz
454,Raul Neto,Utah Jazz
455,Tibor Pleiss,Utah Jazz
456,Jeff Withey,Utah Jazz


### Columns 

In [45]:
df.columns

Index(['Name', 'Team'], dtype='object')

In [46]:
df.columns = ['0', '1']

In [48]:
df.columns = ['Name', 'Team']

In [50]:
# rename returns a new frame with the mapped column labels; because the result
# is not assigned, df keeps its original 'Name' and 'Team' columns.
df.rename(columns={'Name': 'name', 'Team': 'team'})

Unnamed: 0,name,team
0,Avery Bradley,Boston Celtics
1,Jae Crowder,Boston Celtics
2,John Holland,Boston Celtics
3,R.J. Hunter,Boston Celtics
4,Jonas Jerebko,Boston Celtics
...,...,...
453,Shelvin Mack,Utah Jazz
454,Raul Neto,Utah Jazz
455,Tibor Pleiss,Utah Jazz
456,Jeff Withey,Utah Jazz


### isna(), isnull()

In [51]:
df = pd.read_csv('IMDB-Movie-Data.csv')

In [53]:
df.columns

Index(['Rank', 'Title', 'Genre', 'Description', 'Director', 'Actors', 'Year',
       'Runtime (Minutes)', 'Rating', 'Votes', 'Revenue (Millions)',
       'Metascore'],
      dtype='object')

In [56]:
df.isnull().sum()

Rank                    0
Title                   0
Genre                   0
Description             0
Director                0
Actors                  0
Year                    0
Runtime (Minutes)       0
Rating                  0
Votes                   0
Revenue (Millions)    128
Metascore              64
dtype: int64

In [58]:
df.isna().sum()

Rank                    0
Title                   0
Genre                   0
Description             0
Director                0
Actors                  0
Year                    0
Runtime (Minutes)       0
Rating                  0
Votes                   0
Revenue (Millions)    128
Metascore              64
dtype: int64

In [59]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 12 columns):
Rank                  1000 non-null int64
Title                 1000 non-null object
Genre                 1000 non-null object
Description           1000 non-null object
Director              1000 non-null object
Actors                1000 non-null object
Year                  1000 non-null int64
Runtime (Minutes)     1000 non-null int64
Rating                1000 non-null float64
Votes                 1000 non-null int64
Revenue (Millions)    872 non-null float64
Metascore             936 non-null float64
dtypes: float64(3), int64(4), object(5)
memory usage: 93.9+ KB


In [62]:
# Keep only the rows that have no missing value in any column.
df1 = df.dropna()

In [64]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 838 entries, 0 to 999
Data columns (total 12 columns):
Rank                  838 non-null int64
Title                 838 non-null object
Genre                 838 non-null object
Description           838 non-null object
Director              838 non-null object
Actors                838 non-null object
Year                  838 non-null int64
Runtime (Minutes)     838 non-null int64
Rating                838 non-null float64
Votes                 838 non-null int64
Revenue (Millions)    838 non-null float64
Metascore             838 non-null float64
dtypes: float64(3), int64(4), object(5)
memory usage: 85.1+ KB


In [65]:
df.columns

Index(['Rank', 'Title', 'Genre', 'Description', 'Director', 'Actors', 'Year',
       'Runtime (Minutes)', 'Rating', 'Votes', 'Revenue (Millions)',
       'Metascore'],
      dtype='object')

In [67]:
# Drop every column that contains at least one missing value
# (axis='columns' is the spelled-out form of axis=1).
df1 = df.dropna(axis='columns')

In [68]:
df1.columns

Index(['Rank', 'Title', 'Genre', 'Description', 'Director', 'Actors', 'Year',
       'Runtime (Minutes)', 'Rating', 'Votes'],
      dtype='object')

### Imputation 

In [69]:
df.isnull().sum()

Rank                    0
Title                   0
Genre                   0
Description             0
Director                0
Actors                  0
Year                    0
Runtime (Minutes)       0
Rating                  0
Votes                   0
Revenue (Millions)    128
Metascore              64
dtype: int64

In [70]:
df1 = df.fillna(0)

In [71]:
df1.isnull().sum()

Rank                  0
Title                 0
Genre                 0
Description           0
Director              0
Actors                0
Year                  0
Runtime (Minutes)     0
Rating                0
Votes                 0
Revenue (Millions)    0
Metascore             0
dtype: int64

In [73]:
# Impute missing numeric values with the mean of their own column.
# fillna('mean') would insert the literal string 'mean' rather than computing
# anything, so pass the per-column means instead; numeric_only=True skips the
# text columns (Title, Genre, ...), which have no mean.
df.fillna(df.mean(numeric_only=True)).head(2)

Unnamed: 0,Rank,Title,Genre,Description,Director,Actors,Year,Runtime (Minutes),Rating,Votes,Revenue (Millions),Metascore
0,1,Guardians of the Galaxy,"Action,Adventure,Sci-Fi",A group of intergalactic criminals are forced ...,James Gunn,"Chris Pratt, Vin Diesel, Bradley Cooper, Zoe S...",2014,121,8.1,757074,333.13,76
1,2,Prometheus,"Adventure,Mystery,Sci-Fi","Following clues to the origin of mankind, a te...",Ridley Scott,"Noomi Rapace, Logan Marshall-Green, Michael Fa...",2012,124,7.0,485820,126.46,65


In [74]:
revenue = df['Revenue (Millions)']
revenue

0      333.13
1      126.46
2      138.12
3      270.32
4      325.02
        ...  
995       NaN
996     17.54
997     58.01
998       NaN
999     19.64
Name: Revenue (Millions), Length: 1000, dtype: float64

In [75]:
mean_values = revenue.mean() 

In [76]:
mean_values

82.95637614678897

In [78]:
revenue = revenue.fillna(mean_values)

In [79]:
df['Revenue (Millions)'] = revenue

In [80]:
df.isnull().sum()

Rank                   0
Title                  0
Genre                  0
Description            0
Director               0
Actors                 0
Year                   0
Runtime (Minutes)      0
Rating                 0
Votes                  0
Revenue (Millions)     0
Metascore             64
dtype: int64

### Lambda Function 

In [81]:
# A lambda is an anonymous single-expression function; binding it to a name
# lets it be called like any other function.
x = lambda a: a * a  # multiplies its argument by itself

In [83]:
x(4)

16

In [84]:
mean_meta = df['Metascore'].mean()

In [85]:
mean_meta

58.98504273504273

In [87]:
# Replace missing scores with the column mean, element by element.
# NaN is the only float value that is not equal to itself, so `score != score`
# is true exactly for the missing entries.
meta = df['Metascore'].apply(lambda score: mean_meta if score != score else score)

In [90]:
meta.isnull().sum()

0