# How do I get data into my notebook?

## Setting file path

In [None]:
# The data folder path is long, so resolve it once here and reuse `file_loc` in every
# cell below instead of typing it repeatedly.
import os
from pathlib import Path

# Default location of the data files. The 'r' prefix makes this a raw string so Python
# does not interpret the Windows backslashes as escape sequences.
DEFAULT_DATA_DIR = r'C:\Users\hruss\OneDrive\Documents\GMU\Repositories\Data_files'

# The default above only exists on one machine. Setting the DATA_FILES_DIR environment
# variable before starting Jupyter points the notebook at another copy of the data
# without editing any cell; when it is unset the original path is used unchanged.
file_loc = Path(os.environ.get('DATA_FILES_DIR', DEFAULT_DATA_DIR))

# Check to see if the path is an existing directory (prints True or False)
print(file_loc.is_dir())

## CSVs and Excel files

In [None]:
# Importing CSVs and Excel using Pandas
import pandas as pd

In [None]:
# Load the CSV file from the data folder into a DataFrame, then preview the first rows.
df_csv = pd.read_csv(file_loc.joinpath("Movie_Actors.csv"))
df_csv.head()

In [None]:
# Load the Excel workbook from the data folder into a DataFrame, then preview the first rows.
df_excel = pd.read_excel(file_loc.joinpath("Fortune1000.xlsx"))
df_excel.head()

## PDF unstructured text files to wordclouds

In [None]:
# Now let's bring in some text files
from wordcloud import WordCloud
from sklearn.feature_extraction.text import TfidfVectorizer
import PyPDF2 as pyp
import numpy as np
import random

In [None]:
# Read each speech PDF into memory and expose it as a binary, seekable stream.
# Reading inside `with` closes the operating-system file handle immediately (bare open()
# calls would stay open for the life of the kernel), while io.BytesIO still provides the
# read/seek file-like interface that the PDF reader in the next cell is given.
import io


def _load_pdf_stream(filename):
    """Return the contents of ``file_loc/filename`` wrapped in an ``io.BytesIO`` stream."""
    with open(file_loc/filename, 'rb') as pdf_file:   # 'rb' = read-only, binary mode
        return io.BytesIO(pdf_file.read())


Lincoln_pdf = _load_pdf_stream('Address_Lincoln.pdf')
Washington_pdf = _load_pdf_stream('Address_Washington.pdf')
Reagan_pdf = _load_pdf_stream('Address_Reagan.pdf')
JFK_pdf = _load_pdf_stream('Address_JFK.pdf')

In [None]:
# For every speech: wrap its open PDF stream in a PyPDF2 reader and start its extracted
# text as an empty string (the following cell fills the text in page by page).
JFK_read_pdf, JFK_data = pyp.PdfFileReader(JFK_pdf), ""
Lincoln_read_pdf, Lincoln_data = pyp.PdfFileReader(Lincoln_pdf), ""
Washington_read_pdf, Washington_data = pyp.PdfFileReader(Washington_pdf), ""
Reagan_read_pdf, Reagan_data = pyp.PdfFileReader(Reagan_pdf), ""

In [None]:
def _extract_pdf_text(reader):
    """Return the text of every page of a PyPDF2 reader, concatenated in page order.

    Uses the same legacy PyPDF2 calls as the rest of this notebook
    (``numPages`` / ``getPage`` / ``extractText``).
    """
    return "".join(
        reader.getPage(page_no).extractText() for page_no in range(reader.numPages)
    )


# Each *_data string is assigned from scratch rather than appended to, so re-running
# this cell gives the same text instead of a doubled copy of each speech.
JFK_data = _extract_pdf_text(JFK_read_pdf)
Lincoln_data = _extract_pdf_text(Lincoln_read_pdf)
Washington_data = _extract_pdf_text(Washington_read_pdf)
Reagan_data = _extract_pdf_text(Reagan_read_pdf)

In [None]:
# Perform NLP on data to create more meaningful word clouds
from sklearn.feature_extraction.text import TfidfVectorizer

# Single source of truth for "which speaker is which document": the corpus order and the
# column labels further down are both derived from this dict, so they cannot drift apart.
speeches = {
    'Reagan': Reagan_data,
    'JFK': JFK_data,
    'Washington': Washington_data,
    'Lincoln': Lincoln_data,
}
corpus = list(speeches.values())

# Turn words into TF-IDF vectors: single words only (ngram_range=(1, 1)), English stop
# words removed, and max_df / min_df given as fractions of the number of documents.
vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1, 1), max_df=.6, min_df=.01)
X = vectorizer.fit_transform(corpus)
feature_names = vectorizer.get_feature_names_out()

# Turn everything into a dataframe: one row per speech, one column per term ...
df = pd.DataFrame(X.toarray(), columns=feature_names)
# ... then flip it so each term is a row and each speech is a column.
data = df.transpose()

# Label the columns in exactly the corpus order defined above
data.columns = list(speeches)
data.tail()

In [None]:
# Show wordclouds
colors = "viridis"   # matplotlib colormap name used for the words
maxwords = 50        # cap on the number of words drawn per cloud

from matplotlib import pyplot as plt

# Panel order, filled left-to-right then top-to-bottom. Each name must be a column of
# the `data` TF-IDF table built in the previous cell.
speakers = ['Washington', 'Reagan', 'Lincoln', 'JFK']

# One figure with a 2x2 grid of axes
f, axes = plt.subplots(2, 2, figsize=(12, 6))

# Keep the generated clouds by speaker name so they can be inspected or reused later
wordclouds = {}
for ax, speaker in zip(axes.flat, speakers):
    # Word sizes come from that speaker's TF-IDF weights
    wordclouds[speaker] = WordCloud(max_words=maxwords, colormap=colors).generate_from_frequencies(data[speaker])
    ax.imshow(wordclouds[speaker])
    ax.axis('off')
    ax.set_title(f'{speaker} Speech', fontsize=15)

# Tidy the spacing only after all four panels exist; calling it on an empty figure
# has nothing to arrange.
f.tight_layout()

## Images from a GitHub repo

In [None]:
#Import libraries
import os 
import numpy as np 
import matplotlib.pyplot as plt 
import matplotlib.image as mpimg 
import PIL
import base64, io, IPython
from PIL import Image as Image
from urllib.parse import urljoin
from urllib.request import urlopen


def _fetch_image(url):
    """Download ``url`` and return it as a PIL image backed by an in-memory copy.

    The HTTP response is closed as soon as its bytes have been read, so no network
    connection is left open behind the returned image.
    """
    with urlopen(url) as response:
        return Image.open(io.BytesIO(response.read()))


#Pull in files: base folder URL plus the four file names inside it
folder = 'https://raw.githubusercontent.com/PacktWorkshops/The-Data-Visualization-Workshop/master/Datasets/images/'
files = ['photo-1.jpg', 'photo-2.jpg', 'photo-3.jpg', 'photo-4.jpg']        #How do we automatically iterate over all files in GitHub Folder?
# URLs are joined with urljoin rather than os.path.join: os.path.join follows the local
# operating system's path rules, which are not URL rules.
imgs = [_fetch_image(urljoin(folder, file)) for file in files]

#Now, let's print them out to look at them, side by side with a caption under each
fig, axes = plt.subplots(1, 4) 
fig.dpi = 150
labels = ['coast', 'beach', 'building', 'city at night'] 
for ax, img, label in zip(axes, imgs, labels):
    ax.imshow(img)
    ax.set_xticks([])       # hide tick marks; these are photos, not plots
    ax.set_yticks([])
    ax.set_xlabel(label, color='black')

# Pixel array of the last image (the original code reached it through the leftover
# loop index after the loop had finished).
imgarr = np.array(imgs[-1])


In [None]:
# Embed the downloaded images in the notebook itself so they no longer have to be
# fetched from GitHub: each image is re-encoded as PNG, base64-encoded, and wrapped in
# an <img> tag whose source is a data URI.
html = []
for img in imgs:
    png_buffer = io.BytesIO()
    img.save(png_buffer, format='PNG')
    png_b64 = base64.b64encode(png_buffer.getvalue()).decode()
    html.append('<img src="data:image/png;base64,{}"/>'.format(png_b64))
IPython.display.HTML(html[0])       # Change index from 0 to 1, 2 or 3 to see others

# Special topics in data preprocessing

## Application Programming Interfaces

In [None]:
import requests

# The endpoint URL for joke data API (plain string: there is nothing to interpolate)
joke_api_url = "https://official-joke-api.appspot.com/random_joke"

# Send GET request. The timeout stops the cell from hanging forever if the service
# does not answer.
joke_response = requests.get(joke_api_url, timeout=10)

# Fail here with a clear HTTP error instead of a confusing JSON/KeyError further down
joke_response.raise_for_status()

# Parse the JSON body of the response into a Python object
joke = joke_response.json()

# Print the setup part of the response
print(joke['setup'])

In [None]:
# Print the 'punchline' field of the `joke` object parsed from the API response in the
# previous cell.
print(joke['punchline'])

In [None]:
# The endpoint URL for country data API (plain string: there is nothing to interpolate)
api_url = "https://restcountries.com/v3.1/all"

# Only ask for the fields used by the table in the next cell (including the two that
# are commented out there).
# NOTE(review): the REST Countries documentation says the /all endpoint has to be called
# with a `fields` filter of at most 10 fields — confirm against the current API docs.
country_fields = ["name", "unMember", "capital", "region", "latlng",
                  "population", "gini", "car", "startOfWeek"]

# Send GET request; the timeout stops the cell from hanging if the service is down
response = requests.get(api_url, params={"fields": ",".join(country_fields)}, timeout=30)

# Fail here with a clear HTTP error instead of a confusing error when indexing below
response.raise_for_status()

# NOTE: `data` is also the name of the TF-IDF table in the word-cloud section; after
# this line it refers to the parsed JSON instead.
data = response.json()

# Let's look at one record in the response
data[0]


In [None]:
# Column order of the final table
country_columns = ['name', 'UN Member', 'Region', 'Lat-Long', 'Population', 'Driving side', 'Start of week']

# Pull the fields of interest out of every record of the API response. The records are
# collected first and the DataFrame is built once at the end: appending one row at a
# time re-allocates the frame on every step and leaves every column as generic objects.
country_records = [
    {
        'name': country['name']['common'],
        'UN Member': country['unMember'],
    #   'Capital': country['capital'],      # left out, as in the original cell
        'Region': country['region'],
        'Lat-Long': country['latlng'],
        'Population': country['population'],
    #   'Gini': country['gini'],            # left out, as in the original cell
        'Driving side': country['car']['side'],
        'Start of week': country['startOfWeek'],
    }
    for country in data
]

# Passing `columns` fixes the column order and keeps the headers even if `data` is empty
countries_df = pd.DataFrame(country_records, columns=country_columns)

countries_df

## Text processing / classification

In [None]:
import joblib
import re
import string

import numpy as np
import pandas as pd

from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, cohen_kappa_score, f1_score, classification_report
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.naive_bayes import MultinomialNB

In [None]:
# The five newsgroups to download
categories = [
    "alt.atheism",
    "misc.forsale",
    "sci.space",
    "soc.religion.christian",
    "talk.politics.guns",
]

# Fetch the train and test posts together, with headers, footers and quoted replies
# stripped from each post.
news_group_data = fetch_20newsgroups(
    subset="all", remove=("headers", "footers", "quotes"), categories=categories
)

# One row per post: raw text plus its integer class id
df = pd.DataFrame(
    dict(
        text=news_group_data["data"],
        target=news_group_data["target"]
    )
)

# Replace the integer ids with readable names. The ids index the dataset's own
# `target_names` list; decoding through the hand-written `categories` list would only be
# right while that list happens to be in the same order.
target_names = news_group_data["target_names"]
df["target"] = df.target.map(lambda x: target_names[x])

In [None]:
# Display the newsgroup posts table built above: one row per post with its raw text and
# its category name.
df

In [None]:
# Translation table that turns every ASCII punctuation character into a single space.
_PUNCT_TO_SPACE = str.maketrans(string.punctuation, " " * len(string.punctuation))


def process_text(text):
    """Normalize a piece of text for bag-of-words vectorization.

    The input is converted to ``str`` and lower-cased, every ASCII punctuation
    character is replaced by a space, and all runs of whitespace are collapsed to a
    single space with no leading or trailing whitespace.

    Parameters
    ----------
    text : object
        Anything; it is passed through ``str()`` first.

    Returns
    -------
    str
        The cleaned text.
    """
    lowered = str(text).lower()
    without_punct = lowered.translate(_PUNCT_TO_SPACE)
    words = without_punct.split()   # split() with no argument also drops empty pieces
    return " ".join(words)

# Add a cleaned copy of every post as a new `clean_text` column; the raw `text` column
# is left as it is.
df["clean_text"] = df.text.map(process_text)

In [None]:
# Hold out 20% of the posts for testing, keeping the category proportions equal in both
# parts (stratify). The fixed random_state makes the split, and therefore every metric
# reported below, the same on each run of the notebook.
df_train, df_test = train_test_split(df, test_size=0.20, stratify=df.target, random_state=42)

In [None]:
# Bag-of-words features: counts of 1-, 2- and 3-word sequences, English stop words removed.
vec = CountVectorizer(stop_words="english", ngram_range=(1, 3))

# The vocabulary is learned from the training posts only; the test posts are encoded
# with that same vocabulary so nothing about them leaks into the features.
X_train, y_train = vec.fit_transform(df_train.clean_text), df_train.target
X_test, y_test = vec.transform(df_test.clean_text), df_test.target

In [None]:
# Fit a multinomial Naive Bayes classifier on the training counts (fit returns the
# fitted estimator itself, so it can be assigned directly).
nb = MultinomialNB().fit(X_train, y_train)

# Score the held-out posts and print the per-category report
preds = nb.predict(X_test)
print(classification_report(y_test, preds))

In [None]:
# Save the fitted classifier and the fitted vectorizer to files in the working
# directory. Both are loaded again in the next cell: the vectorizer to encode new text,
# the classifier to score it.
joblib.dump(nb, "nb.joblib")
joblib.dump(vec, "vec.joblib")

In [None]:
# Load the classifier and vectorizer back from the files written in the previous cell.
# NOTE: joblib files are pickles; only load files you created or otherwise trust.
nb_saved = joblib.load("nb.joblib")
vec_saved = joblib.load("vec.joblib")

# New, unseen text to classify (a list, because the vectorizer expects a collection of documents)
sample_text = ["Space, Stars, Planets and Astronomy!"]

# Process the text in the same way you did when you trained it!
# process_text works on one string at a time, so it is applied to each document.
clean_sample_text = [process_text(text) for text in sample_text]

# Vectorize the *cleaned* text; the raw text would not match how the training data
# was prepared.
sample_vec = vec_saved.transform(clean_sample_text)
nb_saved.predict(sample_vec)

## Image processing / classification

In [None]:
# The following code comes from here: https://pytorch.org/tutorials/beginner/blitz/cifar10_tutorial.html 

import torch
import torchvision
import torchvision.transforms as transforms

In [None]:
# Pick the first CUDA GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')
# Prints "cuda:0" on a machine with a usable GPU and "cpu" otherwise
print(device)

In [None]:
# Number of images per mini-batch
batch_size = 4

# Preprocessing applied to every image: convert to a tensor, then normalize each of the
# three channels with mean 0.5 and standard deviation 0.5.
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)),
])

# Training split: downloaded into ./data if needed, reshuffled on every pass
trainset = torchvision.datasets.CIFAR10(
    root='./data', train=True, download=True, transform=transform
)
trainloader = torch.utils.data.DataLoader(
    trainset, batch_size=batch_size, shuffle=True, num_workers=2
)

# Test split: same preprocessing, kept in a fixed order
testset = torchvision.datasets.CIFAR10(
    root='./data', train=False, download=True, transform=transform
)
testloader = torch.utils.data.DataLoader(
    testset, batch_size=batch_size, shuffle=False, num_workers=2
)

# Human-readable class names, indexed by the integer label
classes = ('plane', 'car', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship', 'truck')

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Get some random training images: take the first mini-batch from the (shuffled)
# training loader. `images` and `labels` are reused by the display cells below.
dataiter = iter(trainloader)
images, labels = next(dataiter)

In [None]:
# Experiment 1: show the batch exactly as the loader delivers it, i.e. still normalized.
# With the mean-0.5 / std-0.5 Normalize transform used when loading, values should span
# roughly [-1, 1]; matplotlib clips float RGB data to [0, 1], so anything negative is
# presumably drawn as black.
img = torchvision.utils.make_grid(images)
npimg = img.numpy() # still normalized: no rescaling applied
# Move the channel axis from first to last position, the layout imshow expects
plt.imshow(np.transpose(npimg, (1, 2, 0)))
plt.show()
# Class name of each image in the batch, each padded to at least 5 characters
print(' '.join(f'{classes[labels[j]]:5s}' for j in range(batch_size)))

In [None]:
# Experiment 2: halve the values but add no offset. Starting from roughly [-1, 1] this
# gives roughly [-0.5, 0.5], so the image is not fully unnormalized yet.
img = torchvision.utils.make_grid(images)
img = img / 2 # scale only; the +0.5 offset of a true unnormalize is missing
npimg = img.numpy()
# Move the channel axis from first to last position, the layout imshow expects
plt.imshow(np.transpose(npimg, (1, 2, 0)))
plt.show()
# Class name of each image in the batch, each padded to at least 5 characters
print(' '.join(f'{classes[labels[j]]:5s}' for j in range(batch_size)))

In [None]:
# Experiment 3: halve the values and add 0.25. Starting from roughly [-1, 1] this gives
# roughly [-0.25, 0.75]: closer, but still shifted below the displayable [0, 1] range.
img = torchvision.utils.make_grid(images)
img = img / 2 + 0.25 # partial unnormalize: offset is smaller than the 0.5 mean
npimg = img.numpy()
# Move the channel axis from first to last position, the layout imshow expects
plt.imshow(np.transpose(npimg, (1, 2, 0)))
plt.show()
# Class name of each image in the batch, each padded to at least 5 characters
print(' '.join(f'{classes[labels[j]]:5s}' for j in range(batch_size)))

In [None]:
# Experiment 4: the exact inverse of the mean-0.5 / std-0.5 normalization
# (x * 0.5 + 0.5), which maps the values back to [0, 1] so the images display correctly.
img = torchvision.utils.make_grid(images)
img = img / 2 + 0.5 # unnormalize to make image easier to view
npimg = img.numpy()
# Move the channel axis from first to last position, the layout imshow expects
plt.imshow(np.transpose(npimg, (1, 2, 0)))
plt.show()
# Class name of each image in the batch, each padded to at least 5 characters
print(' '.join(f'{classes[labels[j]]:5s}' for j in range(batch_size)))

In [None]:
# Experiment 5: halve the values and add 0.75. Starting from roughly [-1, 1] this gives
# roughly [0.25, 1.25]: shifted above the displayable [0, 1] range.
img = torchvision.utils.make_grid(images)
img = img / 2 + 0.75 # over-shifted: offset is larger than the 0.5 mean
npimg = img.numpy()
# Move the channel axis from first to last position, the layout imshow expects
plt.imshow(np.transpose(npimg, (1, 2, 0)))
plt.show()
# Class name of each image in the batch, each padded to at least 5 characters
print(' '.join(f'{classes[labels[j]]:5s}' for j in range(batch_size)))

In [None]:
# Experiment 6: divide by 3 and add 0.75. Starting from roughly [-1, 1] this gives
# roughly [0.42, 1.08]: a narrower value range that is also shifted upwards.
img = torchvision.utils.make_grid(images)
img = img / 3 + 0.75 # compressed range plus a large offset (not a true unnormalize)
npimg = img.numpy()
# Move the channel axis from first to last position, the layout imshow expects
plt.imshow(np.transpose(npimg, (1, 2, 0)))
plt.show()
# Class name of each image in the batch, each padded to at least 5 characters
print(' '.join(f'{classes[labels[j]]:5s}' for j in range(batch_size)))

In [None]:
import torch.nn as nn
import torch.nn.functional as F

class Net(nn.Module):
    """Small convolutional classifier: two conv + max-pool stages, then three linear layers.

    The first linear layer expects 16 * 5 * 5 features, which is what the two
    5x5 convolutions and 2x2 poolings produce for a 3-channel 32x32 input
    (32 -> 28 -> 14 -> 10 -> 5). The final layer emits 10 raw scores per image.
    """

    def __init__(self):
        super().__init__()
        # Feature extractor. The single pooling layer is shared by both conv stages.
        self.conv1 = nn.Conv2d(in_channels=3, out_channels=6, kernel_size=5)
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2)
        self.conv2 = nn.Conv2d(in_channels=6, out_channels=16, kernel_size=5)
        # Classifier head
        self.fc1 = nn.Linear(in_features=16 * 5 * 5, out_features=120)
        self.fc2 = nn.Linear(in_features=120, out_features=84)
        self.fc3 = nn.Linear(in_features=84, out_features=10)

    def forward(self, x):
        """Return one row of 10 unnormalized class scores for each image in batch ``x``."""
        features = self.pool(F.relu(self.conv1(x)))
        features = self.pool(F.relu(self.conv2(features)))
        flat = features.flatten(start_dim=1)   # keep the batch axis, flatten the rest
        hidden = F.relu(self.fc1(flat))
        hidden = F.relu(self.fc2(hidden))
        return self.fc3(hidden)                # raw scores, no softmax


# Create the network with freshly initialized weights
net = Net()

In [None]:
import torch.optim as optim
# Loss to minimize: cross-entropy between the network's raw scores and the true labels
criterion = nn.CrossEntropyLoss()

# Stochastic gradient descent with momentum over all trainable parameters of `net`
optimizer = optim.SGD(
    net.parameters(),
    lr=0.001,
    momentum=0.9,
)

In [None]:
# Train for two full passes over the training set
for epoch in range(2):

    # Loss accumulated since the last progress line
    running_loss = 0.0

    # Every item from the loader is an [inputs, labels] pair for one mini-batch
    for i, (inputs, labels) in enumerate(trainloader):
        # Gradients accumulate by default, so clear the previous step's first
        optimizer.zero_grad()

        # Forward pass, loss, backward pass, parameter update
        outputs = net(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # Report the average loss once every 2000 mini-batches, then restart the sum
        running_loss += loss.item()
        if (i + 1) % 2000 == 0:
            print(f'[{epoch + 1}, {i + 1:5d}] loss: {running_loss / 2000:.3f}')
            running_loss = 0.0

print('Finished Training')

In [None]:
# Save the trained network's parameters (its state_dict, not the whole model object) to
# a file in the current working directory.
PATH = './cifar_net.pth'
torch.save(net.state_dict(), PATH)

In [None]:
# Take the first mini-batch of the test set (the test loader is not shuffled)
dataiter = iter(testloader)
images, labels = next(dataiter)

# print images
img = torchvision.utils.make_grid(images)
# Unnormalize with the exact inverse of the mean-0.5 / std-0.5 normalization applied
# when loading, so the values return to [0, 1]. A smaller offset would leave the image
# too dark.
img = img / 2 + 0.5
npimg = img.numpy()
# Move the channel axis from first to last position, the layout imshow expects
plt.imshow(np.transpose(npimg, (1, 2, 0)))
plt.show()
# True class name of every image in the batch (however many the batch contains)
print('GroundTruth: ', ' '.join(f'{classes[labels[j]]:5s}' for j in range(len(labels))))

In [None]:
# Replace `net` with a freshly constructed network and load the parameters saved to
# PATH into it, so the cells below run on weights read back from disk.
net = Net()
net.load_state_dict(torch.load(PATH))

In [None]:
# Run the test images shown above through the reloaded network. Without this step
# `outputs` would still hold the scores of the last *training* batch, and the printed
# predictions would have nothing to do with the displayed images.
with torch.no_grad():   # inference only, no gradients needed
    outputs = net(images)

# The predicted class is the index of the highest score in each row
_, predicted = torch.max(outputs, 1)

print('Predicted: ', ' '.join(f'{classes[predicted[j]]:5s}'
                              for j in range(len(predicted))))

In [None]:
# Running totals over the whole test set
correct = 0
total = 0

# Evaluation only: no gradients are needed for the outputs
with torch.no_grad():
    for images, labels in testloader:
        # Scores for this mini-batch
        outputs = net(images)
        # The prediction is the class with the highest score in each row
        predicted = torch.max(outputs.data, 1).indices
        total += len(labels)
        correct += (predicted == labels).sum().item()

print(f'Accuracy of the network on the 10000 test images: {100 * correct // total} %')

In [None]:
# Per-class counters: how many test images of each class were seen, and how many of
# those were classified correctly.
correct_pred = dict.fromkeys(classes, 0)
total_pred = dict.fromkeys(classes, 0)

# Evaluation only: no gradients are needed
with torch.no_grad():
    for images, labels in testloader:
        outputs = net(images)
        predictions = torch.max(outputs, 1).indices
        # Compare as plain Python ints, one (true label, predicted label) pair at a time
        for label, prediction in zip(labels.tolist(), predictions.tolist()):
            classname = classes[label]
            total_pred[classname] += 1
            if label == prediction:
                correct_pred[classname] += 1


# Report the accuracy of every class as a percentage
for classname, correct_count in correct_pred.items():
    accuracy = 100 * float(correct_count) / total_pred[classname]
    print(f'Accuracy for class: {classname:5s} is {accuracy:.1f} %')

In [2]:
! pip freeze

absl-py==1.3.0
affine==2.3.1
aiohttp==3.8.3
aiosignal==1.3.1
ale-py==0.8.0
altair==4.2.0
ansi2html==1.8.0
anyascii==0.3.2
anyio==3.6.2
appdirs==1.4.4
apptools==5.2.0
argon2-cffi==21.3.0
argon2-cffi-bindings==21.2.0
arrow==1.2.3
artist==0.18.2
asttokens==2.0.8
astunparse==1.6.3
async-timeout==4.0.2
attrs==22.1.0
audioread==3.0.0
autobahn==22.12.1
backcall==0.2.0
beautifulsoup4==4.11.1
binaryornot==0.4.4
bleach==5.0.1
blinker==1.5
blis==0.7.9
bokeh==2.4.3
branca==0.6.0
Brotli==1.0.9
cachetools==5.2.0
catalogue==2.0.8
celluloid==0.2.0
certifi==2022.9.24
cffi==1.15.1
chardet==5.0.0
charset-normalizer==2.1.1
chart-studio==1.1.0
click==8.1.3
click-default-group==1.2.2
click-plugins==1.1.1
cligj==0.7.2
cloudpickle==2.2.0
cloup==0.13.1
clustergram==0.6.0
clusteval==2.1.4
cmake==3.24.1.1
cmdstanpy==1.2.0
cmp==0.0.1
cmyt==1.1.3
codecarbon==2.3.1
colorama==0.4.6
colorcet==3.0.1
colorlover==0.3.0
colorspacious==1.1.2
colour==0.1.5
colourmap==1.1.9
commonmark==0.9.1
confection==0.0.4
ConfigArgParse

