## Pandas CSV Practice

In [6]:
import pandas as pd

from io import StringIO

# Build a small in-memory CSV: one header row followed by three records.
data = '\n'.join([
    'col1,col2,col3',
    'a,b,1',
    'a,b,2',
    'c,d,3',
])

# Wrap the text in StringIO so read_csv can consume it like a file.
pd.read_csv(StringIO(data))

Unnamed: 0,col1,col2,col3
0,a,b,1
1,a,b,2
2,c,d,3


In [18]:
# usecols: list-like or callable, default None.
# Alternative forms of the call below:
#   pd.read_csv(StringIO(data), usecols=['col1', 'col3'])                 # list-like
#   pd.read_csv(StringIO(data), usecols=lambda x: x in ['col3', 'col1'])  # element order is ignored

wanted_columns = ['col1', 'col3']
pd.read_csv(
    StringIO(data),
    # Callable form: invoked with each column name, keeps those it returns True for.
    usecols=lambda column_name: column_name in wanted_columns,
)

Unnamed: 0,col1,col3
0,a,1
1,a,2
2,c,3


In [23]:
# skiprows: list-like, integer or callable, default None.
#   - list-like: line numbers to skip (0-indexed).
#   - integer:   number of lines to skip at the start of the file.
#   - callable:  evaluated against the row indices, returning True if the
#                row should be skipped and False otherwise.
# Alternative forms of the call below:
#   pd.read_csv(StringIO(data), skiprows=[1, 3])   # list-like
#   pd.read_csv(StringIO(data), skiprows=1)        # integer


def _is_odd_row(row_index):
    """Return True for odd row indices, i.e. the rows read_csv should skip."""
    return row_index % 2 != 0


pd.read_csv(StringIO(data), skiprows=_is_odd_row)  # callable

Unnamed: 0,col1,col2,col3
0,a,b,2


In [44]:
import numpy as np

# Four-column CSV whose last record is one field short, so column 'd'
# has no value in that row.
data = ('a,b,c,d\n'
        '1,2,3,4\n'
        '5,6,7,8\n'
        '9,10,11')

# Passing the raw text directly to read_csv fails because the string is
# treated as a file path:
#   pd.read_csv(data)  # FileNotFoundError: File b'a,b,c,d...' does not exist

res = pd.read_csv(StringIO(data))
print(res)
print("\nAccessing first value in first column:", res['a'][0])
print("\nData type of each column:\n", res.dtypes)

# Override the inferred dtypes of selected columns while parsing.
df = pd.read_csv(StringIO(data), dtype={'b': object, 'c': np.float64})
# Fixed: this was print(print(...)), which printed the dtypes and then a
# stray "None" (the return value of the inner print).
print("\nUpdated Data type of each column:\n", df.dtypes)

   a   b   c    d
0  1   2   3  4.0
1  5   6   7  8.0
2  9  10  11  NaN

Accessing first value in first column: 1

Data type of each column:
 a      int64
b      int64
c      int64
d    float64
dtype: object

Updated Data type of each column:
 a      int64
b     object
c    float64
d    float64
dtype: object
None


In [72]:
# Single-column CSV mixing integers, a quoted letter and a float.
data = "\n".join(["col_1", "1", "2", "'A'", "4.22"])

# converters is a dictionary of functions for converting values in certain
# columns; using str keeps every value of col_1 as text.
df = pd.read_csv(StringIO(data), converters={'col_1': str})
print("\ndf:\n", df)
print("\nType of df:\n", df.dtypes)

# A float converter fails on the non-numeric entry:
#   pd.read_csv(StringIO(data), converters={'col_1': float})
#   ValueError: could not convert string to float: "'A'"

# Instead, read the data as-is and coerce afterwards: to_numeric with
# errors='coerce' converts every valid value to a float and leaves the
# invalid ones as NaN.
df2 = pd.read_csv(StringIO(data))
df2['col_1'] = pd.to_numeric(df2['col_1'], errors='coerce')

print("\ndf2:\n", df2)
df2_value_types = df2['col_1'].apply(type)
print("\nApply Type to df2:\n", df2_value_types)

print()

# value_counts() returns a Series containing counts of unique values.
df_value_types = df['col_1'].apply(type)
print(df_value_types.value_counts())

print()
print("\nApply Type to df:\n", df_value_types)
print()

# Column dtypes when the same text is read with no converters at all.
print(pd.read_csv(StringIO(data)).dtypes)
print()

# names=[...] together with header=0 replaces the column name from the file.
df4 = pd.read_csv(StringIO(data), names=['foo'], header=0)
print("\ndf4:\n", df4)


df:
   col_1
0     1
1     2
2   'A'
3  4.22

Type of df:
 col_1    object
dtype: object

df2:
    col_1
0   1.00
1   2.00
2    NaN
3   4.22

Apply Type to df2:
 0    <class 'float'>
1    <class 'float'>
2    <class 'float'>
3    <class 'float'>
Name: col_1, dtype: object

<class 'str'>    4
Name: col_1, dtype: int64


Apply Type to df:
 0    <class 'str'>
1    <class 'str'>
2    <class 'str'>
3    <class 'str'>
Name: col_1, dtype: object

col_1    object
dtype: object


df4:
     foo
0     1
1     2
2   'A'
3  4.22


In [75]:
# CSV text with blank lines between the records.
data = '\n'.join(['a,b,c', '', '1,2,3', '', '', '4,5,6'])

# skip_blank_lines=False keeps each blank line as a row of missing values
# instead of dropping it.
pd.read_csv(StringIO(data), skip_blank_lines=False)

Unnamed: 0,a,b,c
0,,,
1,1.0,2.0,3.0
2,,,
3,,,
4,4.0,5.0,6.0


In [84]:
# Load the details file; the path is relative to the current working directory.
# NOTE(review): the recorded output shows a single "Name RollNo" column, which
# suggests the file is not comma-delimited -- confirm whether `sep` is needed.
data = pd.read_csv(filepath_or_buffer="Documents/details.csv")

# Wrap the parsed result in a DataFrame and display it as the cell output.
frame = pd.DataFrame(data=data)
frame

Unnamed: 0,Name RollNo
0,sakshi 1
1,Heena 2
2,Sneha 3


In [93]:
# Load the sales file, skipping malformed lines instead of failing on them.
# `error_bad_lines=False` was deprecated in pandas 1.3 and removed in 2.0
# (it now raises TypeError). `on_bad_lines="warn"` is its replacement: bad
# lines are still skipped and still reported (requires pandas >= 1.3).
# NOTE(review): the recorded output shows tab characters inside a single
# column, so the file may be tab-delimited -- confirm whether sep="\t" is
# what is actually wanted here.
data = pd.read_csv("Documents/sale.csv", on_bad_lines="warn")
frame = pd.DataFrame(data)
frame

b'Skipping line 3: expected 2 fields, saw 3\n'


Unnamed: 0,Number	Customer Name	2016 2017 Percent Growth	Jan Units	Month	Day	Year	Active
0\t10002.0\tQuest Industries\t$125,000.00\t$162500.00\t30.00%\t500\t1\t10\t2015\tY
2\t23477.0\tACME Industrial\t$50,000.00\t$62500.00\t25.00%\t125\t3\t29\t2016\tY
3\t24900.0\tBrekke LTD\t$350,000.00\t$490000.00\t4.00%\t75\t10\t27\t2015\tY
4\t651029.0\tHarbor Co\t$15,000.00\t$12750.00\t-15.00%\tClosed\t2\t2\t2014\tN


### USbirths CSV file

In [122]:
# Read the CSV into a DataFrame; header=None treats the first line as data,
# so the columns get integer labels.
data = pd.read_csv("USbirths.csv", header=None)
print("Type of data:", type(data))

# Wrap the parsed result in a DataFrame and inspect it.
frame = pd.DataFrame(data)
print("Data Frame:\n", frame)
print("Type of frame:", type(frame))
print("Data type:", frame.dtypes)

# head()/tail() with no argument are used for the "5 records" views below;
# an explicit count selects a different number of rows.
first_five, last_five, first_three = data.head(), data.tail(), data.head(3)
print("\nTo fetch first 5 records:\n", first_five)
print("\nTo fetch last 5 records:\n", last_five)
print("\nTo fetch first 3 records:\n", first_three)

Type of data: <class 'pandas.core.frame.DataFrame'>
Data Frame:
                   0
0   1994,1,1,6,8096
1   1994,1,2,7,7772
2  1994,1,3,1,10142
3  1994,1,4,2,11248
4  1994,1,5,3,11053
5  1994,1,6,4,11406
6  1994,1,7,5,11251
7   1994,1,8,6,8653
Type of frame: <class 'pandas.core.frame.DataFrame'>
Data type: 0    object
dtype: object

To fetch first 5 records:
                   0
0   1994,1,1,6,8096
1   1994,1,2,7,7772
2  1994,1,3,1,10142
3  1994,1,4,2,11248
4  1994,1,5,3,11053

To fetch last 5 records:
                   0
3  1994,1,4,2,11248
4  1994,1,5,3,11053
5  1994,1,6,4,11406
6  1994,1,7,5,11251
7   1994,1,8,6,8653

To fetch first 3 records:
                   0
0   1994,1,1,6,8096
1   1994,1,2,7,7772
2  1994,1,3,1,10142


In [137]:
# describe() summarises each column; for an object (string) column like the
# one in the recorded output it reports count/unique/top/freq rather than
# numeric statistics.
print("Describe:\n", data.describe())
# max()/min() are computed per column; on string values the comparison is
# lexicographic, not numeric.
print("\nMax Value:\n", data.max())
print("\nMin Value:\n", data.min())
# A fixed random_state makes the "random" sample identical on every
# Restart & Run All (it was unseeded before, so the output changed per run).
print("\nRandom Sample:\n", data.sample(2, random_state=42))

# Selecting multiple rows and columns from a pandas DataFrame with .loc
# (label-based; the end of a slice is included):
print("\nFetching 0th row and all columns:\n", data.loc[0, :])
print("\nFetching 3 rows and all columns:\n", data.loc[[0, 1, 2], :])
# The column selector can be omitted when all columns are wanted.
print("\nFetching 3 rows and all columns:\n", data.loc[0:2])

Describe:
                        0
count                  8
unique                 8
top     1994,1,3,1,10142
freq                   1

Max Value:
 0    1994,1,8,6,8653
dtype: object

Min Value:
 0    1994,1,1,6,8096
dtype: object

Random Sample:
                   0
5  1994,1,6,4,11406
6  1994,1,7,5,11251

Fetching 0th row and all columns:
 0    1994,1,1,6,8096
Name: 0, dtype: object

Fetching 3 rows and all columns:
                   0
0   1994,1,1,6,8096
1   1994,1,2,7,7772
2  1994,1,3,1,10142

Fetching 0th row and all columns:
                   0
0   1994,1,1,6,8096
1   1994,1,2,7,7772
2  1994,1,3,1,10142
