# Data Collection

In [None]:
!pip install beautifulsoup4

In [None]:
import requests
from bs4 import BeautifulSoup

In [None]:
URL = "https://csc380.beingenfa.com/Syllabus/Key_Info.html"

In [None]:
# requests has no default timeout: without one, an unresponsive server would
# block this cell (and Run All) indefinitely. Give up after 10 seconds instead.
r = requests.get(URL, timeout=10)

In [None]:
r.status_code

In [None]:
r.text

In [None]:
soup = BeautifulSoup(r.text, 'html.parser')

In [None]:
print(soup.prettify())

In [None]:
print(soup.get_text())

In [None]:
# Tag names of the heading levels to extract; find_all accepts a list of names
# and returns every element matching any of them.
heading_tags = ["h1", "h2", "h3"]
for heading in soup.find_all(heading_tags):
    # .text is the heading's text content; strip() trims surrounding whitespace.
    print(heading.text.strip())

# Data Processing

## Introduction to a few Libraries

## 2.1.1 Numpy

NumPy adds support for large, multi-dimensional arrays and matrices, along with a large collection of high-level mathematical functions to operate on these arrays.

In [None]:
import numpy as np

ndarray object: an n-dimensional array of **homogeneous** data types, with many operations being performed in
compiled code for performance

* Fixed Size
* Same type of data
* Much more efficient mathematical operations than built-in data types like lists.

numpy.dtype 
- intc (same as a C integer) and intp (used for indexing)
- int8, int16, int32, int64
- uint8, uint16, uint32, uint64
- float16, float32, float64
- complex64, complex128

## Numpy Arrays

### Create a numpy array

- Conversion from other Python structures (e.g., lists, tuples)
- Built-in NumPy array creation (e.g., arange, ones, zeros, etc.)
- Reading arrays from a file.

In [None]:
np.array([2,3,1,0])

In [None]:
np.zeros((5, 5)) #np.zeros(shape)

In [None]:
np.ones((6, 2))#np.ones(shape)

In [None]:
np.arange(15,5,-1) #Like range function in python

In [None]:
#Return evenly spaced numbers over a specified interval.
np.linspace(0, 100, 5) # numpy.linspace(start, stop, num=50, endpoint=True, retstep=False, dtype=None, axis=0)

In [None]:
np.random.random() #Keeps changing

In [None]:
random_obj = np.random.default_rng(seed=None) #default_rng is the recommended constructor for the random number class
random_obj.random() #changes if you do not give a seed

In [None]:
random_obj = np.random.default_rng(seed=42) #default_rng is the recommended constructor for the random number class
random_obj.random()

In [None]:
print("Original:\n",np.arange(9))
print("After using reshape:\n",np.arange(9).reshape(3,3))

In [None]:
# arange excludes the stop value, so x holds the eight integers 2 through 9.
x = np.arange(2, 10)
print(x)
x[-1]  # a negative index counts from the end: this is the last element

In [None]:
# A new shape must account for exactly the same number of elements. x has 8
# elements, so (1, 3) is rejected with a ValueError; catch it so the error is
# shown without stopping a Restart & Run All.
try:
    x.shape = (1, 3)
except ValueError as err:
    print("Cannot reshape:", err)

# (1, 8) is compatible: one row holding all eight elements.
x.shape = (1, 8)
print(x)
x[-1] # next slide: on a 2-D array, -1 selects the last *row*, not the last element

In [None]:
# View the same eight elements as 2 rows x 4 columns.
x.shape = (2,4)
print("Array:\n",x,"\n")
# The printed labels and the indexed expressions now agree
# (previously x[0] and x[0,3] were printed under these labels).
print("x[-1]: ",x[-1])      # last row
print("x[1,3]: ", x[1,3])   # row 1, column 3

In [None]:
# Two 1-D arrays of equal length: 1..10 and 12..21.
a = np.arange(1, 11)
b = np.arange(12, 22)
a + b  # arithmetic operators work element by element

In [None]:
# Shapes (2, 5) and (5, 2): the inner dimensions agree, so the matrix product
# is defined and has shape (2, 2).
a = np.arange(1, 11).reshape(2, 5)
b = np.arange(12, 22).reshape(5, 2)
result = a @ b  # matrix product (not element-wise); same value as np.dot for 2-D arrays
result

In [None]:
result.transpose()

In [None]:
# Matrix inverse of `result`. The input must be square and non-singular;
# NumPy raises LinAlgError for a singular matrix.
np.linalg.inv(result) # and finally

# Scipy

- Built on top of NumPy
- various tools and functions for solving common problems in scientific computing.

ex: 
- Fourier Transforms (scipy.fftpack)
- Multidimensional image processing (scipy.ndimage)
- Spatial data structures and algorithms (scipy.spatial)
 ..


## Continue our discussion on Pandas

In [1]:
import pandas as pd

In [2]:
# Relative path to the Spotify top-50 CSV that the next cell reads with pd.read_csv.
# NOTE(review): the constant is named WORLD but the file name ends in "-usa" —
# confirm which dataset is intended.
WORLD_DATA_PATH = "spotify-top-50/data/spotify-streaming-top-50-usa.csv"

In [3]:
world_df = pd.read_csv(WORLD_DATA_PATH)

In [4]:
world_df.sample()

Unnamed: 0,date,position,song,artist,popularity,duration_ms,album_type,total_tracks,release_date,is_explicit,album_cover_url
1983,2023-06-26,34,Karma,Taylor Swift,90,204852,album,13,2022-10-21,True,https://i.scdn.co/image/ab67616d0000b273bb54dd...


### Q: The time range of the dataset?

In [7]:
world_df['date'].dtype

dtype('O')

In [9]:
type(world_df['date'][0])

str

In [12]:
# Convert column date to date datatype

world_df['date'] = pd.to_datetime(world_df['date'])

In [16]:
# Q : What is the time range in which this dataset is recording top 50?
# Assume that it records everyday

world_df['date'].max(), world_df['date'].min()

(Timestamp('2023-06-27 00:00:00'), Timestamp('2023-05-18 00:00:00'))

In [None]:
# Report the earliest and latest dates present in the dataset.

chart_dates = world_df['date']
print("Dataset start date:", chart_dates.min())
print("Dataset end date:", chart_dates.max())

In [None]:
# Revisiting groupby: on its own, groupby only returns a DataFrameGroupBy object;
# nothing is aggregated until a method such as .max() is applied to it.
world_df.groupby("position")

In [None]:
# For each chart position, the longest song duration (in ms) seen at that position.
# Selecting with a list ([["duration_ms"]]) keeps the result as a one-column DataFrame.
world_df.groupby("position")[["duration_ms"]].max()

## GroupBy

<img src = "https://pbs.twimg.com/media/CycthXVXgAAazkz?format=jpg&name=small">

source = "https://www.kaggle.com/code/alenavorushilova/grouping-sorting-and-filtering-data-tutorial"

In [None]:
# Q: For each position (1 to 50), which artist was in that rank the maximum number of days


In [None]:
df_with_rank_counts = world_df.groupby("position")["artist"].value_counts().to_frame()
df_with_rank_counts

In [None]:
#Multiindex 
df_with_rank_counts.index

In [None]:
#Getting index values in multi-index
df_with_rank_counts["rank_no"] = df_with_rank_counts.index.get_level_values("position")

In [None]:
df_with_rank_counts.sample()

In [None]:
# Copy the artist names out of the "artist" index level into a regular column.
# The column gets its own name: on pandas < 2.0 the value_counts() column is
# itself called "artist", so assigning to "artist" would overwrite the counts.
df_with_rank_counts["artist_name"] = df_with_rank_counts.index.get_level_values("artist")

In [None]:
df_with_rank_counts.sample()

In [None]:
# Another way
df_with_rank_counts = world_df.groupby("position")["artist"].value_counts().to_frame()
df_with_rank_counts.sample()

In [None]:
df_with_rank_counts.reset_index(level = 0) 

In [None]:
# value_counts() names its result after the grouped column on pandas < 2.0 but
# "count" on pandas >= 2.0. Pinning the name to "artist" keeps the steps below
# working on either version.
df_with_rank_counts = (
    world_df.groupby("position")["artist"].value_counts().rename("artist").to_frame()
)
df_with_rank_counts["count"] = df_with_rank_counts["artist"]
# drop() removes *rows* by default (and would raise KeyError here); columns=
# is required to remove a column. The result is only displayed: without
# reassignment, df_with_rank_counts itself keeps both columns.
df_with_rank_counts.drop(columns="artist")

In [None]:



# Count, for every position, how many rows each artist has, and expose that
# count under two column names: "creator" and "count" (in that order).
# Naming the Series explicitly avoids relying on value_counts()'s default
# result name, which is the grouped column on pandas < 2.0 but "count" on
# pandas >= 2.0. Building the frame in one chain also avoids inplace renames.
df_with_rank_counts = (
    world_df.groupby("position")["artist"]
    .value_counts()
    .rename("creator")
    .to_frame()
    .assign(count=lambda frame: frame["creator"])
)
df_with_rank_counts


In [None]:
# Move the index levels produced by the groupby/value_counts step into
# ordinary columns, leaving a default integer index.
df_with_rank_counts = df_with_rank_counts.reset_index()
df_with_rank_counts

In [None]:



# Row label of the largest "count" among the rows whose position is 1.
is_rank_one = df_with_rank_counts["position"] == 1
df_with_rank_counts.loc[is_rank_one, "count"].idxmax() #Return index

In [None]:
# Look up the full row for that label and show it as a one-row DataFrame
# (.loc with a single label gives a Series; to_frame().T turns it back into a row).
rank_one_counts = df_with_rank_counts.loc[df_with_rank_counts["position"] == 1, "count"]
top_rank_one_label = rank_one_counts.idxmax()
df_with_rank_counts.loc[top_rank_one_label].to_frame().T

In [None]:
# For every position that actually occurs in the data, find the row label of
# the artist with the highest count, then keep that row as a one-row DataFrame.
# groupby yields positions in ascending order, so the list is ordered exactly
# like the previous 1..50 loop. Unlike a hard-coded range(1, 51), this does
# not raise when a position has no rows, and it scans the frame once rather
# than once per position.
top_row_labels = df_with_rank_counts.groupby("position")["count"].idxmax()
list_of_dataframes = [
    df_with_rank_counts.loc[row_label].to_frame().T for row_label in top_row_labels
]

In [None]:
list_of_dataframes[1]

In [None]:
merged_df = pd.concat(list_of_dataframes)
merged_df

In [None]:
# Use the chart position as the row index; drop=True removes it from the columns.
merged_df = merged_df.set_index("position", drop=True)
merged_df

In [None]:
del merged_df["creator"]

In [None]:
merged_df