# 10 minutes to pandas
# This is a short introduction to pandas, geared mainly for new users. 

In [1]:
# Customarily, we import as follows:

import pandas as pd
import numpy as np

# Object Creation

In [2]:
# Create a Series from a plain list of values; pandas supplies a default
# integer index (0..n-1). The np.nan entries force a float64 dtype.

series = pd.Series(
    [1, 2, 3, 4, 5, np.nan, 6, 7, np.nan, 8, 9, 10]
)
series

0      1.0
1      2.0
2      3.0
3      4.0
4      5.0
5      NaN
6      6.0
7      7.0
8      NaN
9      8.0
10     9.0
11    10.0
dtype: float64

In [3]:
# Create a DataFrame by passing a NumPy array, with a datetime index and labeled columns.
# The values come from an unseeded standard-normal draw, so they differ on every run.

dates = pd.date_range('20190115', periods=10)  # 10 consecutive daily timestamps
df = pd.DataFrame(np.random.randn(10, 4), index=dates, columns=list('ABCD'))
# Only the last expression of a cell is rendered, so it is the single bare expression here.
df

Unnamed: 0,A,B,C,D
2019-01-15,-0.774074,-0.564973,0.475639,0.296002
2019-01-16,1.968333,-0.352043,-0.572627,-0.732796
2019-01-17,-0.965229,1.543339,0.915906,-0.791821
2019-01-18,1.11301,-0.49522,-1.563506,0.710127
2019-01-19,-1.719382,0.260816,2.300797,0.125875
2019-01-20,-0.510393,0.856301,-0.76211,0.407293
2019-01-21,-0.609241,0.803932,-1.70702,-0.94435
2019-01-22,1.753193,-0.822404,0.035939,1.677046
2019-01-23,1.068097,-0.39129,0.259843,0.214697
2019-01-24,0.373643,-0.294147,-2.261873,0.402984


In [4]:
# Create a DataFrame from a dict of objects that can be converted to
# series-like columns. Scalar entries ('A', 'B', 'F') are repeated for every
# row; the array-like entries fix the number of rows at four.

df2 = pd.DataFrame(
    {
        'A': 1,
        'B': pd.Timestamp('20190118'),
        'C': pd.Series(2, index=list(range(4))),
        'D': np.array([3] * 4, dtype='int32'),
        'E': pd.Categorical(['train', 'test', 'train', 'test']),
        'F': "Foo",
    }
)
df2

Unnamed: 0,A,B,C,D,E,F
0,1,2019-01-18,2,3,train,Foo
1,1,2019-01-18,2,3,test,Foo
2,1,2019-01-18,2,3,train,Foo
3,1,2019-01-18,2,3,test,Foo


In [5]:
# Each column of the resulting DataFrame keeps its own dtype, so one frame
# can hold integers, timestamps, categoricals and strings side by side.

df2.dtypes

A             int64
B    datetime64[ns]
C             int64
D             int32
E          category
F            object
dtype: object

# Viewing Data

In [6]:
# View the first 3 rows of the frame.
df.head(3)

Unnamed: 0,A,B,C,D
2019-01-15,-0.774074,-0.564973,0.475639,0.296002
2019-01-16,1.968333,-0.352043,-0.572627,-0.732796
2019-01-17,-0.965229,1.543339,0.915906,-0.791821


In [7]:
# View the last 2 rows of the frame.
df.tail(2)

Unnamed: 0,A,B,C,D
2019-01-23,1.068097,-0.39129,0.259843,0.214697
2019-01-24,0.373643,-0.294147,-2.261873,0.402984


In [8]:
# Display the row labels: the DatetimeIndex that was passed in as `dates`.
df.index

DatetimeIndex(['2019-01-15', '2019-01-16', '2019-01-17', '2019-01-18',
               '2019-01-19', '2019-01-20', '2019-01-21', '2019-01-22',
               '2019-01-23', '2019-01-24'],
              dtype='datetime64[ns]', freq='D')

In [9]:
# Display the column labels.
df.columns

Index(['A', 'B', 'C', 'D'], dtype='object')

In [10]:
# DataFrame.to_numpy() gives a NumPy representation of the underlying data.
#
# Note that this can be an expensive operation when your DataFrame has columns
# with different data types, which comes down to a fundamental difference
# between pandas and NumPy: NumPy arrays have one dtype for the entire array,
# while pandas DataFrames have one dtype per column. When you call
# DataFrame.to_numpy(), pandas will find the NumPy dtype that can hold all of
# the dtypes in the DataFrame. This may end up being object, which requires
# casting every value to a Python object.
#
# Every column of df holds floats, so the result here is a plain float array.

df.to_numpy()

array([[-0.77407441, -0.56497308,  0.47563941,  0.2960017 ],
       [ 1.9683327 , -0.35204344, -0.57262722, -0.73279581],
       [-0.96522899,  1.54333943,  0.91590621, -0.7918215 ],
       [ 1.11301014, -0.49522012, -1.56350576,  0.71012722],
       [-1.71938178,  0.26081573,  2.30079661,  0.12587507],
       [-0.51039278,  0.85630129, -0.76211008,  0.40729275],
       [-0.60924065,  0.80393216, -1.70701964, -0.94434998],
       [ 1.75319312, -0.82240361,  0.03593906,  1.67704558],
       [ 1.06809711, -0.39128972,  0.25984256,  0.21469725],
       [ 0.37364281, -0.29414682, -2.26187339,  0.40298363]])

In [11]:
# df2 mixes integers, timestamps, categoricals and strings, so the only NumPy
# dtype that can hold every column is object.
df2.to_numpy()

array([[1, Timestamp('2019-01-18 00:00:00'), 2, 3, 'train', 'Foo'],
       [1, Timestamp('2019-01-18 00:00:00'), 2, 3, 'test', 'Foo'],
       [1, Timestamp('2019-01-18 00:00:00'), 2, 3, 'train', 'Foo'],
       [1, Timestamp('2019-01-18 00:00:00'), 2, 3, 'test', 'Foo']],
      dtype=object)

In [12]:
# describe() shows a quick statistical summary of the data: count, mean,
# standard deviation, min, quartiles and max for each column.

df.describe()

Unnamed: 0,A,B,C,D
count,10.0,10.0,10.0,10.0
mean,0.169796,0.054431,-0.287901,0.136506
std,1.260412,0.774898,1.37364,0.793308
min,-1.719382,-0.822404,-2.261873,-0.94435
25%,-0.732866,-0.469238,-1.363157,-0.518128
50%,-0.068375,-0.323095,-0.268344,0.255349
75%,1.101782,0.668153,0.42169,0.406215
max,1.968333,1.543339,2.300797,1.677046


In [13]:
# Transpose the data: the column labels become the row labels and the dates
# become the column labels.

df.T

Unnamed: 0,2019-01-15,2019-01-16,2019-01-17,2019-01-18,2019-01-19,2019-01-20,2019-01-21,2019-01-22,2019-01-23,2019-01-24
A,-0.774074,1.968333,-0.965229,1.11301,-1.719382,-0.510393,-0.609241,1.753193,1.068097,0.373643
B,-0.564973,-0.352043,1.543339,-0.49522,0.260816,0.856301,0.803932,-0.822404,-0.39129,-0.294147
C,0.475639,-0.572627,0.915906,-1.563506,2.300797,-0.76211,-1.70702,0.035939,0.259843,-2.261873
D,0.296002,-0.732796,-0.791821,0.710127,0.125875,0.407293,-0.94435,1.677046,0.214697,0.402984


In [14]:
# Sort by an axis: axis=0 orders the rows by their index labels (the dates);
# ascending=False puts the latest date first.

df.sort_index(axis=0,ascending=False)

Unnamed: 0,A,B,C,D
2019-01-24,0.373643,-0.294147,-2.261873,0.402984
2019-01-23,1.068097,-0.39129,0.259843,0.214697
2019-01-22,1.753193,-0.822404,0.035939,1.677046
2019-01-21,-0.609241,0.803932,-1.70702,-0.94435
2019-01-20,-0.510393,0.856301,-0.76211,0.407293
2019-01-19,-1.719382,0.260816,2.300797,0.125875
2019-01-18,1.11301,-0.49522,-1.563506,0.710127
2019-01-17,-0.965229,1.543339,0.915906,-0.791821
2019-01-16,1.968333,-0.352043,-0.572627,-0.732796
2019-01-15,-0.774074,-0.564973,0.475639,0.296002


In [15]:
# Sort the rows by the values in column 'B', largest first.

df.sort_values(by='B',ascending=False)

Unnamed: 0,A,B,C,D
2019-01-17,-0.965229,1.543339,0.915906,-0.791821
2019-01-20,-0.510393,0.856301,-0.76211,0.407293
2019-01-21,-0.609241,0.803932,-1.70702,-0.94435
2019-01-19,-1.719382,0.260816,2.300797,0.125875
2019-01-24,0.373643,-0.294147,-2.261873,0.402984
2019-01-16,1.968333,-0.352043,-0.572627,-0.732796
2019-01-23,1.068097,-0.39129,0.259843,0.214697
2019-01-18,1.11301,-0.49522,-1.563506,0.710127
2019-01-15,-0.774074,-0.564973,0.475639,0.296002
2019-01-22,1.753193,-0.822404,0.035939,1.677046


# Selection: `[]`, by label (`df.loc[]`), by position (`df.iloc[]`) and boolean indexing

In [16]:
# Getting: selecting a single column with [] yields a Series named after
# that column.

df['A']

2019-01-15   -0.774074
2019-01-16    1.968333
2019-01-17   -0.965229
2019-01-18    1.113010
2019-01-19   -1.719382
2019-01-20   -0.510393
2019-01-21   -0.609241
2019-01-22    1.753193
2019-01-23    1.068097
2019-01-24    0.373643
Freq: D, Name: A, dtype: float64

In [17]:
# Selecting via [] with an integer slice slices the rows by position.
# The end position is excluded, so this returns the first three rows.

df[0:3]

Unnamed: 0,A,B,C,D
2019-01-15,-0.774074,-0.564973,0.475639,0.296002
2019-01-16,1.968333,-0.352043,-0.572627,-0.732796
2019-01-17,-0.965229,1.543339,0.915906,-0.791821


In [18]:
# [] with a slice of date strings also selects rows; with labels, both
# endpoints are included (2019-01-15 through 2019-01-20).
df['20190115':'20190120']

Unnamed: 0,A,B,C,D
2019-01-15,-0.774074,-0.564973,0.475639,0.296002
2019-01-16,1.968333,-0.352043,-0.572627,-0.732796
2019-01-17,-0.965229,1.543339,0.915906,-0.791821
2019-01-18,1.11301,-0.49522,-1.563506,0.710127
2019-01-19,-1.719382,0.260816,2.300797,0.125875
2019-01-20,-0.510393,0.856301,-0.76211,0.407293


In [19]:
# Get a cross section using a label: a single row, returned as a Series
# indexed by the column names.

df.loc[dates[0]]

A   -0.774074
B   -0.564973
C    0.475639
D    0.296002
Name: 2019-01-15 00:00:00, dtype: float64

In [20]:
# Select on multiple axes by label: every row (:), but only columns 'A' and 'B'.

df.loc[:,['A','B']]

Unnamed: 0,A,B
2019-01-15,-0.774074,-0.564973
2019-01-16,1.968333,-0.352043
2019-01-17,-0.965229,1.543339
2019-01-18,1.11301,-0.49522
2019-01-19,-1.719382,0.260816
2019-01-20,-0.510393,0.856301
2019-01-21,-0.609241,0.803932
2019-01-22,1.753193,-0.822404
2019-01-23,1.068097,-0.39129
2019-01-24,0.373643,-0.294147


In [21]:
# Label slicing with .loc includes both endpoints: rows 2019-01-17 through
# 2019-01-20, columns 'A' and 'B'.

df.loc['20190117':'20190120',['A','B']]

Unnamed: 0,A,B
2019-01-17,-0.965229,1.543339
2019-01-18,1.11301,-0.49522
2019-01-19,-1.719382,0.260816
2019-01-20,-0.510393,0.856301


In [22]:
# A single row label (instead of a slice) reduces the dimensionality of the
# result: a Series is returned rather than a DataFrame.

df.loc['20190117',['A','B']]

A   -0.965229
B    1.543339
Name: 2019-01-17 00:00:00, dtype: float64

In [23]:
# Get a scalar value: one row label plus one column label.

df.loc['20190115','A']

-0.7740744051628975

In [24]:
# Selection by position: .iloc takes integer positions, so 3 is the fourth
# row (2019-01-18), returned as a Series.

df.iloc[3]

A    1.113010
B   -0.495220
C   -1.563506
D    0.710127
Name: 2019-01-18 00:00:00, dtype: float64

In [25]:
# Integer slices: rows at positions 2-4 and columns at positions 0-1
# (the end position is excluded).
df.iloc[2:5,0:2]

Unnamed: 0,A,B
2019-01-17,-0.965229,1.543339
2019-01-18,1.11301,-0.49522
2019-01-19,-1.719382,0.260816


In [26]:
# Lists of integer positions: rows 1, 2 and 5, columns 1 and 3 ('B' and 'D').
df.iloc[[1,2,5],[1,3]]

Unnamed: 0,B,D
2019-01-16,-0.352043,-0.732796
2019-01-17,1.543339,-0.791821
2019-01-20,0.856301,0.407293


In [27]:
# Slice rows explicitly: rows at positions 2-4, all columns.
df.iloc[2:5,:]

Unnamed: 0,A,B,C,D
2019-01-17,-0.965229,1.543339,0.915906,-0.791821
2019-01-18,1.11301,-0.49522,-1.563506,0.710127
2019-01-19,-1.719382,0.260816,2.300797,0.125875


In [28]:
# Get a single value by position: second row, second column (2019-01-16, 'B').
df.iloc[1,1]

-0.35204344063496823

In [29]:
# Boolean indexing: use a single column's values to select rows; here only
# the rows where column B is negative are kept.

df[df.B<0]

Unnamed: 0,A,B,C,D
2019-01-15,-0.774074,-0.564973,0.475639,0.296002
2019-01-16,1.968333,-0.352043,-0.572627,-0.732796
2019-01-18,1.11301,-0.49522,-1.563506,0.710127
2019-01-22,1.753193,-0.822404,0.035939,1.677046
2019-01-23,1.068097,-0.39129,0.259843,0.214697
2019-01-24,0.373643,-0.294147,-2.261873,0.402984


In [30]:
# Apply a boolean condition to the whole frame: the shape is preserved and
# every value that is not greater than 0 is shown as missing (NaN).
df[df>0]

Unnamed: 0,A,B,C,D
2019-01-15,,,0.475639,0.296002
2019-01-16,1.968333,,,
2019-01-17,,1.543339,0.915906,
2019-01-18,1.11301,,,0.710127
2019-01-19,,0.260816,2.300797,0.125875
2019-01-20,,0.856301,,0.407293
2019-01-21,,0.803932,,
2019-01-22,1.753193,,0.035939,1.677046
2019-01-23,1.068097,,0.259843,0.214697
2019-01-24,0.373643,,,0.402984


In [31]:
# Display df2 (at this point still the frame built from a dict of objects).
df2

Unnamed: 0,A,B,C,D,E,F
0,1,2019-01-18,2,3,train,Foo
1,1,2019-01-18,2,3,test,Foo
2,1,2019-01-18,2,3,train,Foo
3,1,2019-01-18,2,3,test,Foo


In [32]:
# Display df again for reference.
df

Unnamed: 0,A,B,C,D
2019-01-15,-0.774074,-0.564973,0.475639,0.296002
2019-01-16,1.968333,-0.352043,-0.572627,-0.732796
2019-01-17,-0.965229,1.543339,0.915906,-0.791821
2019-01-18,1.11301,-0.49522,-1.563506,0.710127
2019-01-19,-1.719382,0.260816,2.300797,0.125875
2019-01-20,-0.510393,0.856301,-0.76211,0.407293
2019-01-21,-0.609241,0.803932,-1.70702,-0.94435
2019-01-22,1.753193,-0.822404,0.035939,1.677046
2019-01-23,1.068097,-0.39129,0.259843,0.214697
2019-01-24,0.373643,-0.294147,-2.261873,0.402984


In [33]:
# Rebind df2 to a new frame: the contents of df plus an extra label column
# 'E'. assign() returns a new DataFrame, so df itself is left untouched.
df2 = df.assign(E=['A', 'B', 'C', 'D', 'E', 'A', 'B', 'C', 'D', 'E'])
df2

Unnamed: 0,A,B,C,D,E
2019-01-15,-0.774074,-0.564973,0.475639,0.296002,A
2019-01-16,1.968333,-0.352043,-0.572627,-0.732796,B
2019-01-17,-0.965229,1.543339,0.915906,-0.791821,C
2019-01-18,1.11301,-0.49522,-1.563506,0.710127,D
2019-01-19,-1.719382,0.260816,2.300797,0.125875,E
2019-01-20,-0.510393,0.856301,-0.76211,0.407293,A
2019-01-21,-0.609241,0.803932,-1.70702,-0.94435,B
2019-01-22,1.753193,-0.822404,0.035939,1.677046,C
2019-01-23,1.068097,-0.39129,0.259843,0.214697,D
2019-01-24,0.373643,-0.294147,-2.261873,0.402984,E


In [34]:
# Filter with isin(): keep only the rows whose 'E' value is 'A' or 'E'.
df2[df2['E'].isin(['A','E'])]

Unnamed: 0,A,B,C,D,E
2019-01-15,-0.774074,-0.564973,0.475639,0.296002,A
2019-01-19,-1.719382,0.260816,2.300797,0.125875,E
2019-01-20,-0.510393,0.856301,-0.76211,0.407293,A
2019-01-24,0.373643,-0.294147,-2.261873,0.402984,E


# Volvoline Test

In [35]:
# while/else: the else branch runs once the loop condition becomes false and
# is skipped only if the loop is left via `break` (which never happens here).
i = 0
while i < 3:
    print(i)
    i += 1
else:
    print(0)

0
1
2
0


In [36]:
# try/except/finally: dividing by 10 cannot raise ZeroDivisionError, so the
# except branch is skipped; the finally branch runs regardless.
data = 50
try:
    data /= 10
except ZeroDivisionError:
    print('not 0', end="")
finally:
    print("orrkforpeek", end="")



                          
    

orrkforpeek

In [37]:
# Tuples are hashable, so they can be used as dictionary keys.
# The dict is deliberately not named `tuple`: that would shadow the built-in
# type for the rest of the kernel session and break any later tuple(...) or
# isinstance(x, tuple) call.
values_by_key = {
    (1, 2, 4): 8,
    (4, 2, 1): 10,
    (1, 2): 12,
}

_sum = 0
for value in values_by_key.values():
    _sum += value
    # Printed on every iteration: the number of keys (3) plus the running total.
    print(len(values_by_key) + _sum)

11
21
33


In [53]:
import statistics

import numpy as np

# Observed values and the predictions made for them.
list1 = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
pred = [1.2, 2.5, 2.9, 4.2, 5.5, 6, 7.1, 8.3, 9, 10.10]

mean = statistics.mean(list1)

# Coefficient of determination: R^2 = 1 - SSE / SST, where SSE is the sum of
# squared prediction errors and SST the sum of squared deviations of the
# observed values from their mean.
residuals = np.array(pred) - np.array(list1)
deviations = mean - np.array(list1)
SSE = sum(residuals ** 2)
SST = sum(deviations ** 2)
RSquared = 1 - (SSE / SST)

# Shown as a percentage.
RSquared * 100

99.15151515151514

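# R² ≈ 0.9915: the predictions explain about 99.15% of the variance in `list1` (a goodness-of-fit score, not a classification accuracy)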