## HTTP using Python vs. Browser

In [9]:
import requests

# Request the page the same way a browser would. Note that this URL is the
# GitHub page *about* the file, so the body printed below is HTML.
# A timeout is passed explicitly: without one, requests waits forever if the
# server accepts the connection but never answers.
r = requests.get('https://github.com/splicemachine/pysplice/blob/master/README.md', timeout=10)
print(r.status_code)
print(r.text)

200






<!DOCTYPE html>
<html lang="en">
  <head>
    <meta charset="utf-8">
  <link rel="dns-prefetch" href="https://github.githubassets.com">
  <link rel="dns-prefetch" href="https://avatars0.githubusercontent.com">
  <link rel="dns-prefetch" href="https://avatars1.githubusercontent.com">
  <link rel="dns-prefetch" href="https://avatars2.githubusercontent.com">
  <link rel="dns-prefetch" href="https://avatars3.githubusercontent.com">
  <link rel="dns-prefetch" href="https://github-cloud.s3.amazonaws.com">
  <link rel="dns-prefetch" href="https://user-images.githubusercontent.com/">



  <link crossorigin="anonymous" media="all" integrity="sha512-BSy+E+S5PJuDWKcXiIXBoFJ7uJ+88y6hFdIhZpf7nf9MVNVvnJDPUaotaxFUQi8UXCLJOcGv1uifxVMc9o5DYQ==" rel="stylesheet" href="https://github.githubassets.com/assets/frameworks-052cbe13e4b93c9b8358a7178885c1a0.css" />
  <link crossorigin="anonymous" media="all" integrity="sha512-aQX2djfPBL4OvvrSedVieBg1MykiSlcGcyx8r0NNVSbPQr2bjw4KhbEdz+MoACuZ3IXmu3z3qzou

In [11]:
from bs4 import BeautifulSoup

# Parse the HTML body of the response with Python's built-in parser.
soup = BeautifulSoup(r.text, 'html.parser')

# Collect every <code> element on the page and show them.
code_tags = soup.find_all('code')
print(code_tags)

[<code>(sudo) pip install git+https://github.com/splicemachine/pysplice</code>, <code>splicemachine.spark.context</code>, <code>splicemachine.mlflow_support</code>, <code>splicemachine.stats</code>, <code>splicemachine.notebook</code>, <code>splicemachine.spark.context</code>, <code>class PySpliceContext(builtins.object)
 |  PySpliceContext(sparkSession, JDBC_URL=None, _unit_testing=False)
 |  
 |  This class implements a SpliceMachineContext object (similar to the SparkContext object)
 |  
 |  Methods defined here:
 |  
 |  __init__(self, sparkSession, JDBC_URL=None, _unit_testing=False)
 |      :param JDBC_URL: (string) The JDBC URL Connection String for your Splice Machine Cluster
 |      :param sparkSession: (sparkContext) A SparkSession object for talking to Spark
 |  
 |  analyzeSchema(self, schema_name)
 |      analyze the schema
 |      :param schema_name: schema name which stats info will be collected
 |      :return:
 |  
 |  analyzeTable(self, schema_table_name, estimateStat

## Jupyter defaults to port 8888, but can be changed
### You can't access an IP unless its port is made available

<code>jupyter notebook </code> <-- 8888<br>
<code>jupyter notebook --port=8889</code> <-- 8889

## SSH and SCP

In [2]:
!pip install -q scp
!pip install -q paramiko



In [9]:
%%writefile test.txt

my test file2

Overwriting test.txt


## Python equivalent of 
<code>scp -i $ex_pem test.txt $ex_host:/home/ubuntu/test.txt</code>

### Where
* ex_pem is your PEM file
* ex_host is your host IP address

[src](https://gist.github.com/batok/2352501)

In [8]:
from paramiko import SSHClient, RSAKey, AutoAddPolicy
from scp import SCPClient

# EC2 host and private key.
# 'pem_loc.txt' contains the path of the PEM file and 'ec2_host.txt' contains
# the host address; both are read from disk so they are not hard-coded here.
# The files are opened with `with` so the handles are closed right away.
with open('pem_loc.txt') as pem_loc_file:
    private_key = RSAKey.from_private_key_file(pem_loc_file.read().strip())
with open('ec2_host.txt') as host_file:
    host = host_file.read().strip()

# The context managers close the SCP channel and the SSH connection even when
# connecting or uploading raises.
with SSHClient() as ssh:
    # Add new host if unknown.
    # NOTE(security): AutoAddPolicy accepts any unknown host key without
    # verification. Fine for a demo; load known host keys for anything real.
    ssh.set_missing_host_key_policy(AutoAddPolicy())

    ssh.connect(host, username='ubuntu', pkey=private_key)

    # SCPClient takes a paramiko transport as an argument
    with SCPClient(ssh.get_transport()) as scp:
        # Upload the single file 'test.txt' into the remote home (~) directory.
        # No recursive flag is needed because this is a file, not a directory.
        scp.put('test.txt', remote_path='~')

## Boto3 and AWS S3

[src](https://boto3.amazonaws.com/v1/documentation/api/latest/guide/s3-examples.html)

In [21]:
!pip install -q boto3

In [1]:
import boto3

# Create an S3 client, then copy one object from the bucket to a local file.
s3 = boto3.client('s3')
s3.download_file(
    Bucket='splice-demo',
    Key='customers-4000.csv',
    Filename='customers.csv',
)


## Read directly with Pandas (assuming it's public)

### For private files you'll need to provide credentials

In [2]:
!pip install -q s3fs

In [16]:
import pandas as pd

# header=None makes pandas treat the first line as data, so the column
# labels are supplied explicitly.
customer_columns = ['ID', 'First', 'Last', "Full"]
df = pd.read_csv(
    's3://splice-demo/customers-4000.csv',
    header=None,
    names=customer_columns,
)
df

Unnamed: 0,ID,First,Last,Full
0,1,Eric,Nash,Eric Nash
1,2,Owen,Clarkson,Owen Clarkson
2,3,Simon,Stewart,Simon Stewart
3,4,Tim,Paterson,Tim Paterson
4,5,Julia,Paige,Julia Paige
...,...,...,...,...
3995,3996,Luke,Mackenzie,Luke Mackenzie
3996,3997,William,Duncan,William Duncan
3997,3998,Boris,Watson,Boris Watson
3998,3999,Nicola,Randall,Nicola Randall


In [19]:
# Keep only the customers whose ID is at most 100.
df2 = df[df['ID'] <= 100]

# index=False: do not write the positional index. Otherwise the CSV gains an
# extra unnamed first column that is not part of the customer data.
df2.to_csv('small_customers.csv', index=False)

In [20]:
# Upload the local CSV to the bucket under the same object name.
s3.upload_file(
    Filename='small_customers.csv',
    Bucket='splice-demo',
    Key='small_customers.csv',
)