<a href="https://colab.research.google.com/github/Bo-fromLA/Bo-fromLA/blob/main/Regex_practice.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_absolute_error, r2_score
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Read the raw housing table from disk
data = pd.read_csv('california_housing.csv')

# Feature engineering: derive per-household and per-room ratios
households = data['households']
total_rooms = data['total_rooms']
data['population_density'] = data['population'] / households
data['rooms_per_household'] = total_rooms / households
data['bedrooms_per_room'] = data['total_bedrooms'] / total_rooms

# Separate the target from the features, then hold out 20% of the rows for testing
y = data['median_house_value']
X = data.drop(columns='median_house_value')
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Check VIF and handle multicollinearity
def calculate_vif(df):
    """Return a table of variance inflation factors, one row per column of ``df``.

    Args:
        df: DataFrame whose ``.values`` array is handed to
            ``variance_inflation_factor``.

    Returns:
        DataFrame with a ``variables`` column (the column labels of ``df``) and a
        ``VIF`` column (the factor computed for the column at the same position).
    """
    design = df.values
    n_features = df.shape[1]
    scores = []
    for position in range(n_features):
        scores.append(variance_inflation_factor(design, position))
    return pd.DataFrame({'variables': df.columns, 'VIF': scores})

# Inspect multicollinearity among the training features
vif = calculate_vif(X_train)
print(vif)

# If VIF is high, consider dropping or combining features

# Standardize the features, then fit an L1-regularized (LASSO) linear model
pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('lasso', Lasso(alpha=0.1)),
])
pipeline.fit(X_train, y_train)

# Score the model on the held-out split
predictions = pipeline.predict(X_test)
r2 = r2_score(y_test, predictions)
mae = mean_absolute_error(y_test, predictions)
print('R2 Score:', r2)
print('MAE:', mae)


In [None]:
import pandas as pd
import numpy as np

np.random.seed(1)

# mnist_train_small.csv has no header row: without header=None pandas consumes the first
# sample as column names (6, 0, 0.1, 0.2, ...) and the frame ends up one row short.
data=pd.read_csv("/content/sample_data/mnist_train_small.csv", header=None)
data.head()

Unnamed: 0,6,0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,...,0.581,0.582,0.583,0.584,0.585,0.586,0.587,0.588,0.589,0.590
0,5,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,7,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,9,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,5,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
data.shape

(19999, 785)

In [None]:
total=np.prod(data.shape)
total

15699215

In [None]:
(data.isna().sum()).sum()

0

In [None]:
import re


In [None]:
text = 'This is a good day.'

In [None]:
if re.search('good', text):
  print('wonderful')
else:
  print('Alas :(')


wonderful


In [None]:
text = 'Amy works diligently. Amy gets good grades. Our student Amy is successful.'


In [None]:
re.split('Amy', text)

['',
 ' works diligently. ',
 ' gets good grades. Our student ',
 ' is successful.']

In [None]:
re.findall("Amy", text)

['Amy', 'Amy', 'Amy']

In [None]:
text = 'Amy works diligently. Amy gets good grades. Our student Amy is successful.'
re.search('^Amy', text)

<re.Match object; span=(0, 3), match='Amy'>

In [None]:
grades = 'ACAAABCBBCBAA'
re.findall('B', grades)

['B', 'B', 'B', 'B']

In [None]:
re.findall('[AB]', grades)

['A', 'A', 'A', 'A', 'B', 'B', 'B', 'B', 'A', 'A']

In [None]:
re.findall('[A][B-C]', grades)

['AC', 'AB']

In [None]:
re.findall('AB|AC', grades)

['AC', 'AB']

In [None]:
re.findall('[^A]',grades)

['C', 'B', 'C', 'B', 'B', 'C', 'B']

In [None]:
re.findall('^[^A]', grades)

[]

## Quantifiers

In [None]:
re.findall('A{2,10}', grades)

['AAA', 'AA']

# New Section

In [None]:
re.findall('A{1,1}A{1,1}', grades)

['AA', 'AA']

In [None]:
# It is a mistake to put spaces inside the braces of the quantifier syntax above:
# 'A{1, 1}' is not read as a quantifier, so it does not match any run of A's in
# grades and findall returns an empty list.

re.findall('A{1, 1}', grades)

[]

In [None]:
re.findall('AA', grades)

['AA', 'AA']

In [None]:
re.findall('A{2}', grades)

['AA', 'AA']

In [None]:
re.findall('A{1,10}B{1,10}C{1,10}', grades)

['AAABC']

In [None]:
# three other quantifiers that are used as shorthand:
# an asterisk * to match zero or more times;
# a question mark ? to match zero or one time;
# a plus sign + to match one or more times

## Scraping from Wikipedia

In [None]:
import os
import requests
from bs4 import BeautifulSoup

In [None]:
from google.colab import files
uploaded = files.upload()

Saving ferpa.txt to ferpa.txt


In [None]:
url = "Course - 1: Introduction to Data Science in Python/resources/week-1/datasets/ferpa.txt"

In [None]:
with open('/content/ferpa.txt', 'r') as file:
  wiki=file.read()
wiki

'Overview[edit]\nFERPA gives parents access to their child\'s education records, an opportunity to seek to have the records amended, and some control over the disclosure of information from the records. With several exceptions, schools must have a student\'s consent prior to the disclosure of education records after that student is 18 years old. The law applies only to educational agencies and institutions that receive funds under a program administered by the U.S. Department of Education.\n\nOther regulations under this act, effective starting January 3, 2012, allow for greater disclosures of personal and directory student identifying information and regulate student IDs and e-mail addresses.[2] For example, schools may provide external companies with a student\'s personally identifiable information without the student\'s consent.[2]\n\nExamples of situations affected by FERPA include school employees divulging information to anyone other than the student about the student\'s grades o

In [None]:
# Raw string so that \[ and \] reach the regex engine as escaped brackets instead of
# being invalid Python string escapes.
re.findall(r'[a-zA-Z]{1,100}\[edit\]', wiki)

['Overview[edit]', 'records[edit]', 'records[edit]']

In [None]:
# the example above didn't quite work
# we can use \w to match any word character: letters, digits and the underscore. \w is a metacharacter
# (raw string, so the backslashes are passed to the regex engine untouched)

re.findall(r'[\w]{1,1000}\[edit\]', wiki)

['Overview[edit]', 'records[edit]', 'records[edit]']

In [None]:
# Same search with the * shorthand (zero or more word characters); raw string keeps
# \w and \[ from being treated as Python string escapes.
re.findall(r'[\w]*\[edit\]', wiki)

['Overview[edit]', 'records[edit]', 'records[edit]']

In [None]:
# now that we have shortened the regex, let's improve it. we can add in a space
# (raw string, so \w and \[ are not treated as Python string escapes)
re.findall(r'[\w ]*\[edit\]', wiki)
# this gets us section titles in the wikipedia page

['Overview[edit]',
 'Access to public records[edit]',
 'Student medical records[edit]']

In [None]:
# you can now iterate through titles by applying another regex
# (both patterns are raw strings so the backslashes reach the regex engine untouched)

for title in re.findall(r'[\w ]*\[edit\]', wiki):
  # now we'll take the intermediate result, split on [ and just take the first piece
  print(re.split(r"[\[]", title)[0])


Overview
Access to public records
Student medical records


In [None]:
# this works but it's a pain.
# so far we've been talking about the regex as a single pattern which is matched.
# but you can match different patterns called GROUPS at the same time
# then refer to these groups later as you want to
# use () to group patterns together
# (raw string, so \w and \[ are not treated as Python string escapes)

re.findall(r'([\w ]*)(\[edit\])', wiki)

# here the python re module breaks out the result by group.


[('Overview', '[edit]'),
 ('Access to public records', '[edit]'),
 ('Student medical records', '[edit]')]

In [None]:
 # we can actually refer to groups by number with the match objects that are returned
# but how do we get back the list of match objects?

# use finditer()
# (raw string, so \w and \[ are not treated as Python string escapes)

for item in re.finditer(r'([\w ]*)(\[edit\])', wiki):
  print(item.groups())

('Overview', '[edit]')
('Access to public records', '[edit]')
('Student medical records', '[edit]')


In [None]:
# we see here that the groups() method returns a tuple of the groups
# use group(0) for the whole match; any other number selects that capture group
# (raw string, so \w and \[ are not treated as Python string escapes)

for item in re.finditer(r'([\w ]*)(\[edit\])', wiki):
  print(item.group(1))

Overview
Access to public records
Student medical records


In [None]:
# we can also name or label groups
# in the previous example we used positioning
# now we try giving them a label and looking at the result as a dictionary

# ()     - delimits the group
# ?P     - indicates an extension to basic regexes
# <name> - the dictionary key we want to use, wrapped in <>
# (raw string, so \w and \[ are not treated as Python string escapes)

for item in re.finditer(r'(?P<title>[\w ]*)(?P<edit_link>\[edit\])', wiki):
  # groupdict() returns the named groups of the match as a dictionary
  print(item.groupdict()['title'])

Overview
Access to public records
Student medical records


In [None]:
# we can print out the whole dictionary for the item too, and see
# that the [edit] string is still in there. here's the dict kept for the laast match

print(item.groupdict())

{'title': 'Student medical records', 'edit_link': '[edit]'}


In [None]:
# there are a number of shorthands, look into the python documentation
# a . for any single character which is not a newline
# a \s for whitespace, like spaces or tabs
# a \d for any digit

# Look-ahead and Look-behind

In [None]:
# the pattern being given to the regex engine is for text either before or after the text
# we are actually trying to isolate

# ?= syntax looks ahead (for example, for the [edit] that we have been throwing away so far)
# (raw string, so \w and \[ are not treated as Python string escapes)

for item in re.finditer(r'(?P<title>[\w ]+)(?=\[edit\])', wiki):
  # The regex has one capturing group, named title, made of one or more spaces or
  # regular word characters.
  # The (?=\[edit\]) look-ahead requires the characters [edit] to follow, but it does
  # not consume them, so they do NOT appear in the match.
  print(item)

<re.Match object; span=(0, 8), match='Overview'>
<re.Match object; span=(2715, 2739), match='Access to public records'>
<re.Match object; span=(3692, 3715), match='Student medical records'>


# Example: Wikipedia Data

In [None]:
!wget https://github.com/yonycherkos/Applied-Data-Science-with-Python-Specialization/blob/main/Course%20-%201:%20Introduction%20to%20Data%20Science%20in%20Python/resources/week-1/datasets/buddhist.txt
with open('buddhist.txt', 'r') as file:
  wiki=file.read()
wiki

--2024-06-27 01:29:03--  https://github.com/yonycherkos/Applied-Data-Science-with-Python-Specialization/blob/main/Course%20-%201:%20Introduction%20to%20Data%20Science%20in%20Python/resources/week-1/datasets/buddhist.txt
Resolving github.com (github.com)... 140.82.114.3
Connecting to github.com (github.com)|140.82.114.3|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified [text/html]
Saving to: ‘buddhist.txt.2’

buddhist.txt.2          [ <=>                ] 314.72K  --.-KB/s    in 0.1s    

2024-06-27 01:29:04 (2.41 MB/s) - ‘buddhist.txt.2’ saved [322273]





In [None]:
# print(wiki)








<!DOCTYPE html>
<html
  lang="en"
  
  data-color-mode="auto" data-light-theme="light" data-dark-theme="dark"
  data-a11y-animated-images="system" data-a11y-link-underlines="true"
  >



  <head>
    <meta charset="utf-8">
  <link rel="dns-prefetch" href="https://github.githubassets.com">
  <link rel="dns-prefetch" href="https://avatars.githubusercontent.com">
  <link rel="dns-prefetch" href="https://github-cloud.s3.amazonaws.com">
  <link rel="dns-prefetch" href="https://user-images.githubusercontent.com/">
  <link rel="preconnect" href="https://github.githubassets.com" crossorigin>
  <link rel="preconnect" href="https://avatars.githubusercontent.com">

  


  <link crossorigin="anonymous" media="all" rel="stylesheet" href="https://github.githubassets.com/assets/light-efd2f2257c96.css" /><link crossorigin="anonymous" media="all" rel="stylesheet" href="https://github.githubassets.com/assets/dark-6b1e37da2254.css" /><link data-color-theme="dark_dimmed" crossorigin="anonymous" medi

In [None]:
# from google.colab import files

# uploaded = files.upload()
# filename = list(uploaded.keys())[0]

# with open(filename, 'r') as file:
#     content = file.read()
#     print(content)

In [None]:
uploaded = files.upload()
wiki = list(uploaded.keys())[0]
wiki

Saving buddhist.txt to buddhist (2).txt


'buddhist (2).txt'

In [None]:
# Read the uploaded file into memory.
# NOTE: `wiki` holds the uploaded file's name on entry and is rebound to the file's text
# here, so re-running this cell on its own passes the text to open(); re-run the upload
# cell first.
with open(wiki, 'r') as file:
  wiki=file.read()
wiki

'Buddhist universities and colleges in the United States\nFrom Wikipedia, the free encyclopedia\nJump to navigationJump to search\n\nThis article needs additional citations for verification. Please help improve this article by adding citations to reliable sources. Unsourced material may be challenged and removed.\nFind sources: "Buddhist universities and colleges in the United States" – news · newspapers · books · scholar · JSTOR (December 2009) (Learn how and when to remove this template message)\nThere are several Buddhist universities in the United States. Some of these have existed for decades and are accredited. Others are relatively new and are either in the process of being accredited or else have no formal accreditation. The list includes:\n\nDhammakaya Open University – located in Azusa, California, part of the Thai Wat Phra Dhammakaya[1]\nDharmakirti College – located in Tucson, Arizona Now called Awam Tibetan Buddhist Institute (http://awaminstitute.org/)\nDharma Realm Buddh

In [None]:
# The verbose mode of Python regex allows you to write multi-line regexes and increases readability.
# In this mode we have to explicitly indicate all whitespace characters, either by prepending
# them with a \ (backslash) or by using the special value \s.
# In return we can lay the regex out like code and even include comments with the # sign.
#
# The string is raw (r""") so the backslashes are passed to the regex engine untouched, and
# the separator is an en dash (–) because that is the character the text uses before
# "located in"; an ASCII hyphen never matches it.
# \w* does not match spaces, so lines whose city name contains a space do not match.

pattern = r"""
(?P<title>.*)         # the university title
(–\ located\ in\ )    # an indicator of the location
(?P<city>\w*)         # city the university is in (any number of word characters)
(,\ )                 # separator for the state
(?P<state>\w*)        # the state the city is located in
"""

# Now with finditer() we just pass the re.VERBOSE flag as the last parameter, this makes it
# much easier to understand large regexes!

for item in re.finditer(pattern, wiki, re.VERBOSE):
  # each match exposes its named groups as a dictionary
  print(item.groupdict())

In [None]:
pattern = r"""
(?P<title>.*?)          # the university title (non-greedy)
\s*–\s*located\s+in\s+  # indicator for the location (allows for various spacing)
(?P<city>[\w\s]+?)      # city the university is in (non-greedy, allowing spaces)
,\s+                    # separator for the state
(?P<state>\w+)          # the state the city is located in
"""


# Compile once with re.VERBOSE so the layout whitespace and the trailing # comments inside
# the pattern are ignored, then walk the matches.
university_re = re.compile(pattern, re.VERBOSE)

for item in university_re.finditer(wiki):
  # groupdict() maps each named group (title, city, state) to its captured text
  print(item.groupdict())

{'title': 'Dhammakaya Open University', 'city': 'Azusa', 'state': 'California'}
{'title': 'Dharmakirti College', 'city': 'Tucson', 'state': 'Arizona'}
{'title': 'Dharma Realm Buddhist University', 'city': 'Ukiah', 'state': 'California'}
{'title': 'Ewam Buddhist Institute', 'city': 'Arlee', 'state': 'Montana'}
{'title': 'Institute of Buddhist Studies', 'city': 'Berkeley', 'state': 'California'}
{'title': 'Maitripa College', 'city': 'Portland', 'state': 'Oregon'}
{'title': 'Soka University of America', 'city': 'Aliso Viejo', 'state': 'California'}
{'title': 'University of the West', 'city': 'Rosemead', 'state': 'California'}
{'title': 'Won Institute of Graduate Studies', 'city': 'Glenside', 'state': 'Pennsylvania'}


# Example: New York Times and Hashtags

In [None]:
# NOTE(review): the original call was open('', 'r'), which always raises FileNotFoundError.
# The intended file is presumably the course's NYT health tweets dump (nytimeshealth.txt)
# — TODO confirm the name/location and upload it first if needed.
health_path = 'nytimeshealth.txt'
with open(health_path, 'r') as file:
  health=file.read()
health

In [None]:
# Let's create a pattern.
# First we want to include the hash sign, then any number of word characters
# (\w already covers digits, so the \d inside the class is redundant but harmless),
# and we end when we see whitespace (checked with a look-ahead, so it is not consumed).
# Raw string, so \w, \d and \s are not treated as Python string escapes.

pattern = r'#[\w\d]*(?=\s)'
re.findall(pattern, health)