## Data Processing

> ### Data Operations

In [1]:
import numpy as np
a = np.array([[1,2],[3,4]])
print(a)

[[1 2]
 [3 4]]


In [2]:
# ndmin mean size of array 
a = np.array([1,2,3,4,5], ndmin=3)
print(a)

[[[1 2 3 4 5]]]


In [3]:
a = np.array([1,2,3], dtype=complex)
b = np.array([1,2,3], dtype=float)

print(a)
print(b)


[1.+0.j 2.+0.j 3.+0.j]
[1. 2. 3.]


In [4]:
import pandas as pd
data = np.array(['a','b','c','d','e'])
s = pd.Series(data)
print(s)

0    a
1    b
2    c
3    d
4    e
dtype: object


In [5]:
data = {'Name':['Tom','Jerry','Jack','Oggy'],'Age':[24,34,54,12]}
df = pd.DataFrame(data,index=['rank1','rank2','rank3','rank4'])
print(df)

        Name  Age
rank1    Tom   24
rank2  Jerry   34
rank3   Jack   54
rank4   Oggy   12


In [6]:
# Panel Module Remove From new Version of Pandas

# data = { 'Item1': pd.DataFrame(np.random.randn(4,3)),
#          'Item2': pd.DataFrame(np.random.randn(4,2))
# }
# p = pd.Panel(data)
# print(p)

> ### Data Cleansing

In [7]:
import pandas as pd
import numpy as np

print(np.random.randn(5,3))
df = pd.DataFrame(np.random.randn(5,3), index=['a','c','e','f','h'], columns=['one', 'two', 'three'])

df = df.reindex(['a','b','c','d','e','f','g','h'])

print(df)

[[-0.0801828  -1.58879849  1.80885461]
 [-0.36140808 -0.63402728 -0.93015091]
 [-0.34730585  0.35092776  0.82125395]
 [ 0.1745353  -0.57768879 -0.84077988]
 [ 0.2505146   1.13967294 -0.97283411]]
        one       two     three
a -0.541525  1.161069  0.024134
b       NaN       NaN       NaN
c -0.489007 -1.682834  0.613665
d       NaN       NaN       NaN
e  0.212142 -0.949780 -0.113359
f  0.516374 -0.451251 -0.428413
g       NaN       NaN       NaN
h  0.187758  2.934563  1.049456


In [8]:
df.isnull()

Unnamed: 0,one,two,three
a,False,False,False
b,True,True,True
c,False,False,False
d,True,True,True
e,False,False,False
f,False,False,False
g,True,True,True
h,False,False,False


- fillna function fill the NAN value to respected add as parameter in Function

In [9]:
# df.fillna(0)

- fillna function we can add method='pad' or method='fill' it's mean add forward value

In [10]:
# print(df.fillna(method='pad'))

- fillna function we can add method='bfill' or method='backfill' it's mean add backward value

In [11]:
print(df.fillna(method='bfill'))

        one       two     three
a -0.541525  1.161069  0.024134
b -0.489007 -1.682834  0.613665
c -0.489007 -1.682834  0.613665
d  0.212142 -0.949780 -0.113359
e  0.212142 -0.949780 -0.113359
f  0.516374 -0.451251 -0.428413
g  0.187758  2.934563  1.049456
h  0.187758  2.934563  1.049456


- dropna functin use delete all null value

In [12]:
df.dropna()

Unnamed: 0,one,two,three
a,-0.541525,1.161069,0.024134
c,-0.489007,-1.682834,0.613665
e,0.212142,-0.94978,-0.113359
f,0.516374,-0.451251,-0.428413
h,0.187758,2.934563,1.049456


- replace function use for repalce value 

In [13]:
df  = pd.DataFrame({
    'One':[10,20,30,40,50,2000],
    'Two':[1000,0,10,23,45,78]
})
df.replace({2000:1234, 1000:9870})

Unnamed: 0,One,Two
0,10,9870
1,20,0
2,30,10
3,40,23
4,50,45
5,1234,78


> ### Processing CSV Data

In [14]:
import pandas as pd

data = pd.read_csv('top_youtube_channel_data.csv')
data

Unnamed: 0,rank,youtuber,subscribers,video views,video count,category,started
0,1,T-Series,213000000,188073919029,16708.0,Music,2006
1,2,YouTube Movies,150000000,167122746349,,Film & Animation,2015
2,3,Cocomelon - Nursery Rhymes,133000000,126822520940,751.0,Education,2006
3,4,SET India,131000000,101541977714,78334.0,Shows,2006
4,5,Music,116000000,78437871689,,Music,2013
...,...,...,...,...,...,...,...
95,96,Markiplier,32600000,18011837263,5129.0,Gaming,2012
96,97,Like Nastya ESP,32600000,15144858210,584.0,Entertainment,2017
97,98,Ryan's World,32400000,51312603726,2155.0,Entertainment,2015
98,99,ABP News,32300000,9850740503,209351.0,People & Blogs,2012


In [15]:
data.head()

Unnamed: 0,rank,youtuber,subscribers,video views,video count,category,started
0,1,T-Series,213000000,188073919029,16708.0,Music,2006
1,2,YouTube Movies,150000000,167122746349,,Film & Animation,2015
2,3,Cocomelon - Nursery Rhymes,133000000,126822520940,751.0,Education,2006
3,4,SET India,131000000,101541977714,78334.0,Shows,2006
4,5,Music,116000000,78437871689,,Music,2013


In [16]:
data.tail()

Unnamed: 0,rank,youtuber,subscribers,video views,video count,category,started
95,96,Markiplier,32600000,18011837263,5129.0,Gaming,2012
96,97,Like Nastya ESP,32600000,15144858210,584.0,Entertainment,2017
97,98,Ryan's World,32400000,51312603726,2155.0,Entertainment,2015
98,99,ABP News,32300000,9850740503,209351.0,People & Blogs,2012
99,100,Desi Music Factory,32200000,9115577588,122.0,Music,2014


In [17]:
data.head()['category']

0               Music 
1    Film & Animation 
2           Education 
3               Shows 
4               Music 
Name: category, dtype: object

In [18]:
print(data.loc[:,['rank','category']].head())

   rank           category
0     1             Music 
1     2  Film & Animation 
2     3         Education 
3     4             Shows 
4     5             Music 


In [19]:
data.loc[1:10,['rank','category']]

Unnamed: 0,rank,category
1,2,Film & Animation
2,3,Education
3,4,Shows
4,5,Music
5,6,Gaming
6,7,Entertainment
7,8,People & Blogs
8,9,Gaming
9,10,People & Blogs
10,11,Sports


> ### Processing JSON Data

In [20]:
import pandas as pd

data = pd.read_json('processing_json_data.json')
data

Unnamed: 0,ID,Name,Salary,StartDate,Dept
0,1,Rick,623.3,1/1/2012,IT
1,2,Dan,515.2,9/23/2013,Operations
2,3,Michelle,611.0,11/15/2014,IT
3,4,Ryan,729.0,5/11/2014,HR
4,5,Gary,843.25,3/27/2015,Finance
5,6,Nina,578.0,5/21/2013,IT
6,7,Simon,632.8,7/30/2013,Operations
7,8,Guru,722.5,6/17/2014,Finance


In [21]:
data.loc[:, ['Salary', 'Dept']]

Unnamed: 0,Salary,Dept
0,623.3,IT
1,515.2,Operations
2,611.0,IT
3,729.0,HR
4,843.25,Finance
5,578.0,IT
6,632.8,Operations
7,722.5,Finance


In [22]:
data.loc[[1,3,4],['Salary','Dept']]

Unnamed: 0,Salary,Dept
1,515.2,Operations
3,729.0,HR
4,843.25,Finance


In [23]:
data.to_json(orient="records")

'[{"ID":1,"Name":"Rick","Salary":623.3,"StartDate":"1\\/1\\/2012","Dept":"IT"},{"ID":2,"Name":"Dan","Salary":515.2,"StartDate":"9\\/23\\/2013","Dept":"Operations"},{"ID":3,"Name":"Michelle","Salary":611.0,"StartDate":"11\\/15\\/2014","Dept":"IT"},{"ID":4,"Name":"Ryan","Salary":729.0,"StartDate":"5\\/11\\/2014","Dept":"HR"},{"ID":5,"Name":"Gary","Salary":843.25,"StartDate":"3\\/27\\/2015","Dept":"Finance"},{"ID":6,"Name":"Nina","Salary":578.0,"StartDate":"5\\/21\\/2013","Dept":"IT"},{"ID":7,"Name":"Simon","Salary":632.8,"StartDate":"7\\/30\\/2013","Dept":"Operations"},{"ID":8,"Name":"Guru","Salary":722.5,"StartDate":"6\\/17\\/2014","Dept":"Finance"}]'

In [24]:
data.to_json(orient='records', lines=True)

'{"ID":1,"Name":"Rick","Salary":623.3,"StartDate":"1\\/1\\/2012","Dept":"IT"}\n{"ID":2,"Name":"Dan","Salary":515.2,"StartDate":"9\\/23\\/2013","Dept":"Operations"}\n{"ID":3,"Name":"Michelle","Salary":611.0,"StartDate":"11\\/15\\/2014","Dept":"IT"}\n{"ID":4,"Name":"Ryan","Salary":729.0,"StartDate":"5\\/11\\/2014","Dept":"HR"}\n{"ID":5,"Name":"Gary","Salary":843.25,"StartDate":"3\\/27\\/2015","Dept":"Finance"}\n{"ID":6,"Name":"Nina","Salary":578.0,"StartDate":"5\\/21\\/2013","Dept":"IT"}\n{"ID":7,"Name":"Simon","Salary":632.8,"StartDate":"7\\/30\\/2013","Dept":"Operations"}\n{"ID":8,"Name":"Guru","Salary":722.5,"StartDate":"6\\/17\\/2014","Dept":"Finance"}\n'

> ### Processing XLS Data

- *convert csv to excel*

In [25]:
import pandas as pd

data = pd.read_csv("top_youtube_channel_data.csv")
data.to_excel("top_youtube_channel_data.xlsx", index=None, header=True)

- *Reading Excel File*

In [26]:
data = pd.read_excel("top_youtube_channel_data.xlsx")
data

Unnamed: 0,rank,youtuber,subscribers,video views,video count,category,started
0,1,T-Series,213000000,188073919029,16708.0,Music,2006
1,2,YouTube Movies,150000000,167122746349,,Film & Animation,2015
2,3,Cocomelon - Nursery Rhymes,133000000,126822520940,751.0,Education,2006
3,4,SET India,131000000,101541977714,78334.0,Shows,2006
4,5,Music,116000000,78437871689,,Music,2013
...,...,...,...,...,...,...,...
95,96,Markiplier,32600000,18011837263,5129.0,Gaming,2012
96,97,Like Nastya ESP,32600000,15144858210,584.0,Entertainment,2017
97,98,Ryan's World,32400000,51312603726,2155.0,Entertainment,2015
98,99,ABP News,32300000,9850740503,209351.0,People & Blogs,2012


In [27]:
data.loc[[1,2,3,5,6,7], ['youtuber','category']]

Unnamed: 0,youtuber,category
1,YouTube Movies,Film & Animation
2,Cocomelon - Nursery Rhymes,Education
3,SET India,Shows
5,PewDiePie,Gaming
6,MrBeast,Entertainment
7,Kids Diana Show,People & Blogs


In [28]:
import pandas as pd 
with pd.ExcelFile('top_youtube_channel_data.xlsx') as xls:
    df1 = pd.read_excel(xls, 'Sheet1')
    # df2 = pd.read_excel(xls, 'Sheet2')

df1
# try:
#     df2
# except:
#     print("Not a Sheet2")

Unnamed: 0,rank,youtuber,subscribers,video views,video count,category,started
0,1,T-Series,213000000,188073919029,16708.0,Music,2006
1,2,YouTube Movies,150000000,167122746349,,Film & Animation,2015
2,3,Cocomelon - Nursery Rhymes,133000000,126822520940,751.0,Education,2006
3,4,SET India,131000000,101541977714,78334.0,Shows,2006
4,5,Music,116000000,78437871689,,Music,2013
...,...,...,...,...,...,...,...
95,96,Markiplier,32600000,18011837263,5129.0,Gaming,2012
96,97,Like Nastya ESP,32600000,15144858210,584.0,Entertainment,2017
97,98,Ryan's World,32400000,51312603726,2155.0,Entertainment,2015
98,99,ABP News,32300000,9850740503,209351.0,People & Blogs,2012


> ### *Relational Database*

In [29]:
!pip install sqlalchemy



In [30]:
from sqlalchemy import create_engine
import pandas as pd

data = pd.read_csv("top_youtube_channel_data.csv")

# Create the db engine
engine = create_engine('sqlite:///:memory:')

# Store the DataFrame as table 
data.to_sql('data_table',engine)

# Query 1 on the relational table
res1 = pd.read_sql_query('SELECT * FROM data_table', engine)
print("Result 1")
print(res1)
print(" ")

# Query 2 on the realtional Database
res2 = pd.read_sql_query('SELECT category,youtuber,rank FROM data_table', engine)
print("Result 2")
print(res2)

Result 1
    index  rank                     youtuber  subscribers      video views  \
0       0     1                    T-Series     213000000  188,073,919,029   
1       1     2              YouTube Movies     150000000  167,122,746,349   
2       2     3  Cocomelon - Nursery Rhymes     133000000  126,822,520,940   
3       3     4                   SET India     131000000  101,541,977,714   
4       4     5                       Music     116000000   78,437,871,689   
..    ...   ...                          ...          ...              ...   
95     95    96                  Markiplier      32600000   18,011,837,263   
96     96    97             Like Nastya ESP      32600000   15,144,858,210   
97     97    98                Ryan's World      32400000   51,312,603,726   
98     98    99                    ABP News      32300000    9,850,740,503   
99     99   100          Desi Music Factory      32200000    9,115,577,588   

    video count           category  started   
0      

> ### *NoSQL Database*

In [31]:
!pip install pymongo



In [32]:
from pymongo import MongoClient
from pprint import pprint

# Choose the appropriate client
client = MongoClient()

# Connect to the test db
db = client.test

# Use the Employee Collection
employee = db.employee
employee_details = {
    'Name': 'Dev Darji',
    'Address': 'Shivdhara Residency',
    'Age': '23'
}

# employee.insert_one(employee_details)
# employee.find_one({'Age': '23'})

# Use the insert method
# result = employee.insert_one(employee_details)
# result

# Query for the inserted document.
# queryResult = employee.find_one({'Age':'23'})
# pprint(queryResult)

> ### *Date and Time* 

- *Date Time Representation*
- *Date Time Arithmetic*
- *Date Time Comparison*

>> #### *Date time Representation* 

In [33]:
import datetime

print("Print the date of Today:", datetime.datetime.today())

Print the date of Today: 2022-06-01 21:46:58.757329


In [34]:
date_today = datetime.date.today()
print("Today Date : ",date_today)

Today Date :  2022-06-01


In [35]:
print("This Year : ", date_today.year)

This Year :  2022


In [36]:
print("This Month : ", date_today.month)

This Month :  6


In [37]:
print("This week day",date_today.day)

This week day 1


In [38]:
print("Month Name : ", date_today.strftime('%B'))

Month Name :  June


In [39]:
print("Week Day Name : ", date_today.strftime('%A'))

Week Day Name :  Wednesday


>> ##### *Date Time Arithmetic*

In [40]:
import datetime

# Capture the First Date
day1 = datetime.date(2022, 5, 31)
print("day1 : ", day1.ctime())

day1 :  Tue May 31 00:00:00 2022


In [41]:
day2 = datetime.date(2022, 6, 25)
print("day2 : ", day2.ctime())

day2 :  Sat Jun 25 00:00:00 2022


In [42]:
print("Number of Days : ", day2-day1)

Number of Days :  25 days, 0:00:00


In [43]:
date_today = datetime.date.today()
print(date_today)

2022-06-01


In [44]:
# Create a delta of 4 days
no_of_days = datetime.timedelta(days=4)
print("No of Days", no_of_days)

No of Days 4 days, 0:00:00


In [45]:
before_four_days = date_today -no_of_days
print("Before 4 Days", before_four_days)

Before 4 Days 2022-05-28


In [46]:
after_four_days = date_today + no_of_days
print("After 4 Days", after_four_days)

After 4 Days 2022-06-05


>> ##### *Date Time Comparison*

In [47]:
import datetime

date_today = datetime.date.today()

no_of_days = datetime.timedelta(days=4)

before_four_days = date_today - no_of_days
print("Before Four Days : ", before_four_days)

after_four_days = date_today + no_of_days
print("After Four Days : ", after_four_days)

date1 = datetime.date(2022,5,28)
print("Date1 : ",date1)

if date1 == before_four_days:
    print("Same Date")
if date_today > date1:
    print("Past Date")
if date1 < after_four_days:
    print("Future Days")

Before Four Days :  2022-05-28
After Four Days :  2022-06-05
Date1 :  2022-05-28
Same Date
Past Date
Future Days


> ### Data Wrangling

>> ##### Merging Data

In [48]:
import pandas as pd

left = pd.DataFrame({
    'id':[1,2,3,4,5],
    'Name':['Dev','Ravi','Kiran','Haresh','Mayur'],
    'subject_id':['sub1','sub2','sub3','sub4','sub5']
})

right = pd.DataFrame({
    'id':[1,2,3,4,5],
    'Name':['Darshan','Suhag','Karan','Parth','Raju'],
    'subject_id':['sub1','sub2','sub3','sub4','sub5']
})

print(left)
print(right)

   id    Name subject_id
0   1     Dev       sub1
1   2    Ravi       sub2
2   3   Kiran       sub3
3   4  Haresh       sub4
4   5   Mayur       sub5
   id     Name subject_id
0   1  Darshan       sub1
1   2    Suhag       sub2
2   3    Karan       sub3
3   4    Parth       sub4
4   5     Raju       sub5


>> ##### *Grouping Data*

In [50]:
# import the pandas library
import pandas as pd

ipl_data = {'Team': ['Riders', 'Riders', 'Devils', 'Devils', 'Kings',
    'kings', 'Kings', 'Kings', 'Riders', 'Royals', 'Royals', 'Riders'],
    'Rank': [1, 2, 2, 3, 3,4 ,1 ,1,2 , 4,1,2],
    'Year': [2014,2015,2014,2015,2014,2015,2016,2017,2016,2014,2015,2017],
    'Points':[876,789,863,673,741,812,756,788,694,701,804,690]}
df = pd.DataFrame(ipl_data)

grouped = df.groupby('Team')
print(grouped.get_group('Kings'))

    Team  Rank  Year  Points
4  Kings     3  2014     741
6  Kings     1  2016     756
7  Kings     1  2017     788


>> #### *Concatenating Data*

In [51]:
print(pd.concat([left, right]))

   id     Name subject_id
0   1      Dev       sub1
1   2     Ravi       sub2
2   3    Kiran       sub3
3   4   Haresh       sub4
4   5    Mayur       sub5
0   1  Darshan       sub1
1   2    Suhag       sub2
2   3    Karan       sub3
3   4    Parth       sub4
4   5     Raju       sub5


> ### Data Aggregation

In [52]:
import pandas as pd
import numpy as np

df = pd.DataFrame(np.random.randn(10,4),
                  index=pd.date_range('1/1/2000', periods=10),
                  columns=['A','B','C','D'])
df

Unnamed: 0,A,B,C,D
2000-01-01,1.277315,-2.191765,-0.960569,2.383179
2000-01-02,-0.280791,-1.00023,-1.27802,0.079329
2000-01-03,0.13642,0.546435,0.862198,-0.453536
2000-01-04,-0.353127,-1.005162,-0.694877,-0.025424
2000-01-05,0.917599,0.485,1.117373,-0.821272
2000-01-06,-0.877615,-1.124511,0.781579,-0.220623
2000-01-07,-0.811726,0.175593,0.470564,0.321021
2000-01-08,1.08963,-1.895296,-0.711523,1.945817
2000-01-09,0.554224,-0.432168,-0.741712,1.641254
2000-01-10,1.336466,0.325694,-1.30363,0.050914


In [57]:
r = df.rolling(window=3, min_periods=1)
print(r)

Rolling [window=3,min_periods=1,center=False,axis=0,method=single]


In [58]:
print(df)
print(r.aggregate(np.sum))

                   A         B         C         D
2000-01-01  1.277315 -2.191765 -0.960569  2.383179
2000-01-02 -0.280791 -1.000230 -1.278020  0.079329
2000-01-03  0.136420  0.546435  0.862198 -0.453536
2000-01-04 -0.353127 -1.005162 -0.694877 -0.025424
2000-01-05  0.917599  0.485000  1.117373 -0.821272
2000-01-06 -0.877615 -1.124511  0.781579 -0.220623
2000-01-07 -0.811726  0.175593  0.470564  0.321021
2000-01-08  1.089630 -1.895296 -0.711523  1.945817
2000-01-09  0.554224 -0.432168 -0.741712  1.641254
2000-01-10  1.336466  0.325694 -1.303630  0.050914
                   A         B         C         D
2000-01-01  1.277315 -2.191765 -0.960569  2.383179
2000-01-02  0.996524 -3.191995 -2.238589  2.462508
2000-01-03  1.132944 -2.645560 -1.376391  2.008971
2000-01-04 -0.497498 -1.458956 -1.110699 -0.399631
2000-01-05  0.700893  0.026274  1.284693 -1.300232
2000-01-06 -0.313143 -1.644673  1.204074 -1.067319
2000-01-07 -0.771742 -0.463918  2.369515 -0.720874
2000-01-08 -0.599711 -2.844214 

In [59]:
print(df)
print(r['A'].aggregate(np.sum))

                   A         B         C         D
2000-01-01  1.277315 -2.191765 -0.960569  2.383179
2000-01-02 -0.280791 -1.000230 -1.278020  0.079329
2000-01-03  0.136420  0.546435  0.862198 -0.453536
2000-01-04 -0.353127 -1.005162 -0.694877 -0.025424
2000-01-05  0.917599  0.485000  1.117373 -0.821272
2000-01-06 -0.877615 -1.124511  0.781579 -0.220623
2000-01-07 -0.811726  0.175593  0.470564  0.321021
2000-01-08  1.089630 -1.895296 -0.711523  1.945817
2000-01-09  0.554224 -0.432168 -0.741712  1.641254
2000-01-10  1.336466  0.325694 -1.303630  0.050914
2000-01-01    1.277315
2000-01-02    0.996524
2000-01-03    1.132944
2000-01-04   -0.497498
2000-01-05    0.700893
2000-01-06   -0.313143
2000-01-07   -0.771742
2000-01-08   -0.599711
2000-01-09    0.832129
2000-01-10    2.980320
Freq: D, Name: A, dtype: float64


In [60]:
print(df)
print(r[['A','B']].aggregate(np.sum))

                   A         B         C         D
2000-01-01  1.277315 -2.191765 -0.960569  2.383179
2000-01-02 -0.280791 -1.000230 -1.278020  0.079329
2000-01-03  0.136420  0.546435  0.862198 -0.453536
2000-01-04 -0.353127 -1.005162 -0.694877 -0.025424
2000-01-05  0.917599  0.485000  1.117373 -0.821272
2000-01-06 -0.877615 -1.124511  0.781579 -0.220623
2000-01-07 -0.811726  0.175593  0.470564  0.321021
2000-01-08  1.089630 -1.895296 -0.711523  1.945817
2000-01-09  0.554224 -0.432168 -0.741712  1.641254
2000-01-10  1.336466  0.325694 -1.303630  0.050914
                   A         B
2000-01-01  1.277315 -2.191765
2000-01-02  0.996524 -3.191995
2000-01-03  1.132944 -2.645560
2000-01-04 -0.497498 -1.458956
2000-01-05  0.700893  0.026274
2000-01-06 -0.313143 -1.644673
2000-01-07 -0.771742 -0.463918
2000-01-08 -0.599711 -2.844214
2000-01-09  0.832129 -2.151871
2000-01-10  2.980320 -2.001771


> ### Reading HTML Pages

- Library known as beautifulsoup. Using this library, we can search for the values of html tags and get specific data like title of the page and the list of headers in the page.


- Install Beautifulsoup

In [63]:
!pip install beautifulsoup4



> *Reading the HTML File*

In [96]:
import requests
import urllib.request as urllib2
from bs4 import BeautifulSoup

# Fetch the html file
url = 'https://www.tutorialspoint.com/python/index.htm'
# site = requests.get(url)
# print(site.status)

res = urllib2.urlopen(url)
html_doc = res.read()
# res.status

In [185]:
# parse the html file
soup = BeautifulSoup(html_doc,'html.parser')

# soup

main_topic = []
card_list = []

body =  soup.find('div', {"class":"card card-body bg-light pt-0"}).find_all('ul', {"class":"toc chapters"})
  
for x in range(len(body)):
    card_list.append(body[x].find_all('li'))


for a in range(len(card_list)):
    for i in range(len(card_list[a])):
        main_topic.append(card_list[a][i].string)


print(main_topic)
print(len(main_topic))

['Python Basic Tutorial', 'Python - Home', 'Python - Overview', 'Python - Environment Setup', 'Python - Basic Syntax', 'Python - Variable Types', 'Python - Basic Operators', 'Python - Decision Making', 'Python - Loops', 'Python - Numbers', 'Python - Strings', 'Python - Lists', 'Python - Tuples', 'Python - Dictionary', 'Python - Date & Time', 'Python - Functions', 'Python - Modules', 'Python - Files I/O', 'Python - Exceptions', 'Python Advanced Tutorial', 'Python - Classes/Objects', 'Python - Reg Expressions', 'Python - CGI Programming', 'Python - Database Access', 'Python - Networking', 'Python - Sending Email', 'Python - Multithreading', 'Python - XML Processing', 'Python - GUI Programming', 'Python - Further Extensions', 'Python  Useful Resources', 'Python - Questions and Answers', 'Python - Quick Guide', 'Python - Tools/Utilities', 'Python - Useful Resources', 'Python - Discussion']
36


> ### Processing Unstructured Data

>> ##### Reading Data

- In the Below Example we take a text file and read the file segregating each of the lines in it. Next we can divide the output into further lines and words. The original file is text file containing some paragraphs describing the ython language.

In [187]:
filename = 'input.txt'

with open(filename) as fn:
    # Read Each Line
    read_line = fn.readline()

    print(read_line)    
    # Count of Lines
    # line_count = 1
    # while line_count:
    #     print("Line {} {}".format(line_count, read_line.strip()))
        
    #     read_line = fn.readline()
    #     # line_count += 1 

Python is an easy to learn, powerful programming language. It has efficient high-level data structures and a simple but effective approach to object-oriented programming. Python’s elegant syntax and dynamic typing, together with its interpreted nature, make it an ideal language for scripting and rapid application development in many areas on most platforms.



In [189]:
from collections import Counter

with open('input.txt') as f:
    p =  Counter(f.read().split())
    print(p)

Counter({'and': 11, 'Python': 8, 'the': 6, 'to': 5, 'for': 4, 'is': 3, 'an': 3, 'language': 3, 'in': 3, 'The': 3, 'interpreter': 3, 'It': 2, 'data': 2, 'a': 2, 'but': 2, 'with': 2, 'many': 2, 'are': 2, 'freely': 2, 'or': 2, 'all': 2, 'from': 2, 'be': 2, 'also': 2, 'of': 2, 'as': 2, 'tutorial': 2, 'easy': 1, 'learn,': 1, 'powerful': 1, 'programming': 1, 'language.': 1, 'has': 1, 'efficient': 1, 'high-level': 1, 'structures': 1, 'simple': 1, 'effective': 1, 'approach': 1, 'object-oriented': 1, 'programming.': 1, 'Python’s': 1, 'elegant': 1, 'syntax': 1, 'dynamic': 1, 'typing,': 1, 'together': 1, 'its': 1, 'interpreted': 1, 'nature,': 1, 'make': 1, 'it': 1, 'ideal': 1, 'scripting': 1, 'rapid': 1, 'application': 1, 'development': 1, 'areas': 1, 'on': 1, 'most': 1, 'platforms.': 1, 'extensive': 1, 'standard': 1, 'library': 1, 'available': 1, 'source': 1, 'binary': 1, 'form': 1, 'major': 1, 'platforms': 1, 'web': 1, 'site,': 1, 'https://www.python.org/,': 1, 'may': 1, 'distributed.': 1, 'sam

> ### Word Tokenization

- Work Tokenization is the process of splitting a large sample of text into words. 
- This is a requirement in natural language processing tasks wher each word needs to be captured and subjected to further analysis like classifying ans counting them for a particular sentiment etc.
- The *Natural language Tool Kit(NLTK)* is a llibrary uesd to achives this.
- Install NLTK before procedding with the python program for word tokenizatin.

In [190]:
!pip install nltk



In [198]:
import nltk

word_data = "Python is an easy to learn, powerful programming language. It has efficient high-level data structures and a simple but effective approach to object-oriented programming. Python’s elegant syntax and dynamic typing, together with its interpreted nature, make it an ideal language for scripting and rapid application development in many areas on most platforms."

# Punkt Sentence Tokenizer this first download in your system
# nltk.download('punkt')

nltk_tokens = nltk.word_tokenize(word_data)
print(nltk_tokens)

['Python', 'is', 'an', 'easy', 'to', 'learn', ',', 'powerful', 'programming', 'language', '.', 'It', 'has', 'efficient', 'high-level', 'data', 'structures', 'and', 'a', 'simple', 'but', 'effective', 'approach', 'to', 'object-oriented', 'programming', '.', 'Python', '’', 's', 'elegant', 'syntax', 'and', 'dynamic', 'typing', ',', 'together', 'with', 'its', 'interpreted', 'nature', ',', 'make', 'it', 'an', 'ideal', 'language', 'for', 'scripting', 'and', 'rapid', 'application', 'development', 'in', 'many', 'areas', 'on', 'most', 'platforms', '.']


>> ##### Tokenizing Sentences

In [195]:
nltk_tokens = nltk.sent_tokenize(word_data)
nltk_tokens

['Python is an easy to learn, powerful programming language.',
 'It has efficient high-level data structures and a simple but effective approach to object-oriented programming.',
 'Python’s elegant syntax and dynamic typing, together with its interpreted nature, make it an ideal language for scripting and rapid application development in many areas on most platforms.']

> ### Stemming and Lemmatization

In [200]:
import nltk
from nltk.stem.porter import PorterStemmer

porter_stremmer = PorterStemmer()

nltk_tokens = nltk.word_tokenize(word_data)

for i in nltk_tokens:
    print("Actual: {} Stem: {}".format(i, porter_stremmer.stem(i)))

Actual: Python Stem: python
Actual: is Stem: is
Actual: an Stem: an
Actual: easy Stem: easi
Actual: to Stem: to
Actual: learn Stem: learn
Actual: , Stem: ,
Actual: powerful Stem: power
Actual: programming Stem: program
Actual: language Stem: languag
Actual: . Stem: .
Actual: It Stem: it
Actual: has Stem: ha
Actual: efficient Stem: effici
Actual: high-level Stem: high-level
Actual: data Stem: data
Actual: structures Stem: structur
Actual: and Stem: and
Actual: a Stem: a
Actual: simple Stem: simpl
Actual: but Stem: but
Actual: effective Stem: effect
Actual: approach Stem: approach
Actual: to Stem: to
Actual: object-oriented Stem: object-ori
Actual: programming Stem: program
Actual: . Stem: .
Actual: Python Stem: python
Actual: ’ Stem: ’
Actual: s Stem: s
Actual: elegant Stem: eleg
Actual: syntax Stem: syntax
Actual: and Stem: and
Actual: dynamic Stem: dynam
Actual: typing Stem: type
Actual: , Stem: ,
Actual: together Stem: togeth
Actual: with Stem: with
Actual: its Stem: it
Actual: inter

In [205]:
from nltk.stem import WordNetLemmatizer

wordnet_lemmatizer = WordNetLemmatizer()

nltk_tokens = nltk.word_tokenize(word_data)

# nltk.download('wordnet')
# nltk.download('omw-1.4')

for i  in nltk_tokens:
    print('Actual : {} Stem : {}'.format(i, wordnet_lemmatizer.lemmatize(i)))

Actual : Python Stem : Python
Actual : is Stem : is
Actual : an Stem : an
Actual : easy Stem : easy
Actual : to Stem : to
Actual : learn Stem : learn
Actual : , Stem : ,
Actual : powerful Stem : powerful
Actual : programming Stem : programming
Actual : language Stem : language
Actual : . Stem : .
Actual : It Stem : It
Actual : has Stem : ha
Actual : efficient Stem : efficient
Actual : high-level Stem : high-level
Actual : data Stem : data
Actual : structures Stem : structure
Actual : and Stem : and
Actual : a Stem : a
Actual : simple Stem : simple
Actual : but Stem : but
Actual : effective Stem : effective
Actual : approach Stem : approach
Actual : to Stem : to
Actual : object-oriented Stem : object-oriented
Actual : programming Stem : programming
Actual : . Stem : .
Actual : Python Stem : Python
Actual : ’ Stem : ’
Actual : s Stem : s
Actual : elegant Stem : elegant
Actual : syntax Stem : syntax
Actual : and Stem : and
Actual : dynamic Stem : dynamic
Actual : typing Stem : typing
Actu