# Regular expressions

In [17]:
import re
import pandas as pd

In [18]:
text = 'Around 2500 patients are taking part in clinical trails #Coronavirus'

In [25]:
text='Sachin is an all rounder in cricket world'

In [26]:
re.findall('[a-zA-Z]+',text)

['Sachin', 'is', 'an', 'all', 'rounder', 'in', 'cricket', 'world']

In [23]:
print(text)
re.findall('[a-z]+', text) # Find all sequency of lower case characters

Around 2500 patients are taking part in clinical trails #Coronavirus


['round',
 'patients',
 'are',
 'taking',
 'part',
 'in',
 'clinical',
 'trails',
 'oronavirus']

In [4]:
print(text)
re.findall('[a-zA-Z]+', text) # Find all sequency of lower & upper case characters

Around 2500 patients are taking part in clinical trails #Coronavirus


['Around',
 'patients',
 'are',
 'taking',
 'part',
 'in',
 'clinical',
 'trails',
 'Coronavirus']

 Extended Regular Expressions:
 
     \d -> Any digit, equivalent to [0-9]
     \D -> Any non-digit, equivalent to [^0-9]
    \w -> Any alphanumeric, equivalent to [a-zA-Z0-9_]
    \W -> Non-alphanumeric, equivalent to [^a-zA-Z0-9_]
    \s -> Any whitespace character
    \S -> Any nonwhitespace character
    
    () -> Scoping for extraction
    {} -> Frequency for extraction
    ? -> Make a pattern non greedy

In [5]:
print(text)
re.findall('\w+', text) # Find all sequency of word characters[a-zA-Z0-9_]

Around 2500 patients are taking part in clinical trails #Coronavirus


['Around',
 '2500',
 'patients',
 'are',
 'taking',
 'part',
 'in',
 'clinical',
 'trails',
 'Coronavirus']

In [61]:
text = "The film Titanic was released in year 98 and was a hit till the year 2000 \n5000 was the cost of the mobile\ni baragined it to"

In [62]:
print(text)

The film Titanic was released in year 98 and was a hit till the year 2000 
5000 was the cost of the mobile
i baragined it to


In [71]:
for line in text.split("\n"):
    patterns = re.findall("\d+",line)
    if len(patterns)>0:
        print(patterns)

['98', '2000']
['5000']


In [70]:
for line in text.split("\n"):
    line = line.strip()
    find = re.findall("\d{4}\$",line)
    if len(find)>0: 
        print(find)

In [58]:
text = 'A message from csev@umich.edu to cwen@iupui.edu about meeting @2PM'

In [35]:
re.findall("\S+@\S+", text)

['csev@umich.edu', 'cwen@iupui.edu']

In [12]:
re.findall("\s+@\S+", text)

[' @2PM']

In [13]:
re.findall("\S+@(\S+)", text)

['umich.edu', 'iupui.edu']

In [None]:
re.findall("(\S+)@\S+", text)

['csev', 'cwen']

#### Cleaning text using re.sub

In [45]:
text = 'Around 2,500 patients are taking part in ** clinical trails #Coronavirus'

In [46]:
print(text)
re.sub('[^\w+]', '', text)

Around 2,500 patients are taking part in ** clinical trails #Coronavirus


'Around2500patientsaretakingpartinclinicaltrailsCoronavirus'

In [52]:
print(text)
ss ='this is my string & && (* remove'
re.sub('[^\w+\s]', '', ss)

Around 2,500 patients are taking part in ** clinical trails #Coronavirus


'this is my string    remove'

In [53]:
text = "film ABC  @ was ? produced %  in , year $ 1994  .  'by'   Mr_X"

In [54]:
#Removing special charecters with nothing
result1 = re.sub("[,@'?.$%_]", "", text)
result1

'film ABC   was  produced   in  year  1994    by   MrX'

In [55]:
#Removing special charecters(non Alpha numeric and Space) with nothing
result1 = re.sub("[^a-zA-Z0-9 ]","",text)
result1

'film ABC   was  produced   in  year  1994    by   MrX'

In [56]:
# \W -> Alphanumeric with underscores
#\s -> Space
result1 = re.sub("[^\w\s]","",text)
result1

'film ABC   was  produced   in  year  1994    by   Mr_X'

In [None]:
#Removing multiple spaces with a single space
result = re.sub("\s+", " ", result1)
result

'film ABC was produced in year 1994 by Mr_X'

### Extracting text from HTML tags

In [72]:
text = """<div>
<h1> H2O</h1>
<p> AutoML</p>
<a href="https://www.amazon.ai/products/h2o-driverless-ai/"> Driverless AI</a>
</div>"""

In [73]:
print(text)

<div>
<h1> H2O</h1>
<p> AutoML</p>
<a href="https://www.amazon.ai/products/h2o-driverless-ai/"> Driverless AI</a>
</div>


To Extract Text Between the HTML tages we need to remove all the text between < and >
i.e remove  Zero or more occourrence of any chatecter between < and >

In [75]:
print(re.sub('<.*?>',"",text))


 H2O
 AutoML
 Driverless AI



In [None]:
#.* and + are greedy - it matches everything including the closing angular bracket >
# .* and .+ should stop matching a pattern as soon as it encounters the closing angular bracket
# use a "?" along with .* and .+ to make it non greedy

In [None]:
print(re.sub('<.*?>',"",text))


 H2O
 AutoML
 Driverless AI



## Extracting hashtags from tweets

In [None]:
text = 'Around 2,500 patients are taking part in clinical trails #Coronavirus'
print(text)
print(re.findall('#\w+', text))

Around 2,500 patients are taking part in clinical trails #Coronavirus
['#Coronavirus']


In [None]:
tweets = pd.read_csv('C:/Users/Raghavendra N/OneDrive/Official/Datasets/tweets_donald_trump.csv')
tweets.head()

Unnamed: 0,created_at,language,likes,retweets,text
0,2020-06-17 03:27:56,en,123212.0,18568.0,96% Approval Rating in the Republican Party. T...
1,2020-06-17 02:45:33,und,0.0,7942.0,RT @TONYxTWO: @thejtlewis @JoeBiden https://t....
2,2020-06-17 02:38:20,en,0.0,23815.0,RT @thejtlewis: “Trump isn’t going to accept t...
3,2020-06-17 02:37:01,en,0.0,6781.0,"RT @thejtlewis: With the utmost respect, I tha..."
4,2020-06-17 02:31:11,en,56840.0,14231.0,A GREAT woman. Her son is looking down from he...


Obtain the frequency of each of the hashtags

- Step1: Extract all the hash tags and store them in a list
- Step2: Compute the frequency of each of the hashtags

## Cleaning salary

In [None]:
jobs = pd.read_csv('C:/Users/Raghavendra N/OneDrive/Official/Datasets/datascience_jobs.csv')
jobs.head(5)

Unnamed: 0,title,location,experience,skills,company,salary,description,posted_date
0,Data Science,Mumbai,2-4 yrs,"Algorithms, Machine Learning, Python, Java, Da...",Netcore Solutions Pvt Ltd,"2,00,000 - 7,00,000 P.A.",At least 2 year of experience in data engineer...,1 day ago
1,Analyst / Sr. Analyst (data Science),Gurgaon,5-8 yrs,"predictive modeling, predictive analytics, mac...",Cvent India Pvt. Ltd.,"5,00,000 - 10,00,000 P.A.",Strong experience on providing predictive mode...,Today
2,ETL Lead & Data Science,"Chennai, Bengaluru, Mumbai, Pune, Noida",7-10 yrs,"SQL, Data Analysis, Text Mining, SAS, R, Stati...",COMPUTER POWER GROUP PRIVATE LIMITED,"10,00,000 - 15,00,000 P.A.",Industry experience in building and operationa...,1 day ago
3,Specialist - Data Science,"Delhi NCR, Bengaluru, Gurgaon",7-12 yrs,"Specialist - Data Science, Data Science, data ...",Brainsearch Consulting Pvt Ltd.Â,Not disclosed,- Experience with one or more data science pro...,1 day ago
4,Group Manager - Data Science - Python/nlp,Bengaluru,6-11 yrs,"machine learning, text mining, r, nlp, data sc...",Staffio HR,Not disclosed,- This is a Team management role - Skill set ...,1 day ago


In [None]:
# Task : From the salary column extract the minimum and maximum salary, NA if unable to extract

In [81]:
strings =' 66 .6 anil'
re.sub('[^\d+\.]','',strings)

'66.6'