In [None]:
from IPython.display import display, HTML

# Stylesheet for the `.output_scroll` class: cap the height at 400px, scroll
# vertically past that, and frame the area with a light border and padding.
SCROLL_STYLE = """
<style>
.output_scroll {
    max-height: 400px;
    overflow-y: auto;
    border: 1px solid #ddd;
    padding: 10px;
}
</style>
"""

# Inject the stylesheet into the notebook page.
display(HTML(SCROLL_STYLE))

# Navigation

Open STATA to its landing page

In [None]:
import os
import sys

# Folder inside the local Stata installation that contains the pystata package.
STATA_UTILITIES_DIR = "/Applications/Stata/utilities"

# Fail early with a clear message; os.chdir() raised the same exception type
# when the folder was missing.
if not os.path.isdir(STATA_UTILITIES_DIR):
    raise FileNotFoundError(f"Stata utilities folder not found: {STATA_UTILITIES_DIR}")

# Put pystata on the import path instead of calling os.chdir(): changing the
# working directory would make every later relative path resolve inside the
# Stata installation folder.
if STATA_UTILITIES_DIR not in sys.path:
    sys.path.append(STATA_UTILITIES_DIR)

from pystata import config

# Start the embedded Stata session using the "se" edition.
config.init("se")

## Calculations without a dataset
STATA supports simple arithmetic

In [None]:
%%stata
* "display" (abbreviated "di") evaluates an expression and prints the result
display 5 * 2
display 10 / 2

## Opening a .dta dataset

Open your own .dta file in STATA using "use FILE.dta"; a CSV file is read with "import delimited FILE.csv" instead<br>
STATA also provides built-in and online example datasets, loaded with "sysuse FILE.dta" or "webuse FILE.dta" respectively

In [None]:
%%stata
* Load the built-in "auto" dataset, replacing any data currently in memory
sysuse auto, clear
* Summary statistics for mpg
summarize mpg

In [None]:
import io

import pandas as pd
import requests

# Local Stata dataset. The path lives in its own variable so that `babies`
# only ever names the DataFrame (it is passed to Stata by a later cell).
babies_path = '/Users/mujiechen/Jupyter-Notebook/STATA/Datasets/babies.dta'
babies = pd.read_stata(babies_path)
display(babies.head())

# Remote CSV example dataset hosted on stata.com.
nhanes2_url = "https://www.stata.com/python/pystata18/misc/nhanes2.csv"
response = requests.get(nhanes2_url, timeout=30)
# Stop on HTTP errors instead of parsing an error page as CSV.
response.raise_for_status()
data = response.content
nhanes2 = pd.read_csv(io.StringIO(data.decode("utf-8")))
nhanes2

## Display All Data
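In the STATA application, "br" or "br VARIABLE" opens the Data Browser; in this notebook, "list" prints the observations instead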

In [None]:
%%stata -d babies
* The -d option loads the pandas DataFrame "babies" as the current Stata dataset
* "list" without a variable list prints every variable for every observation
list

## One Way Table Showing Frequency of Variable

In [None]:
%%stata
* One-way frequency table of bwt ("tab" is the abbreviation of "tabulate")
tabulate bwt

## Two Way Table Showing Frequency of Two Variables
Summarises the relationship between two categorical variables, where the first argument is shown in the row, and the second argument is shown in the column<br>
", row" shows within-row relative frequencies in %

In [None]:
%%stata
* Two-way table: bwt in the rows, gest in the columns
* The "row" option adds within-row percentages to each cell
tabulate bwt gest, row

## Reveal Structure and Properties of Dataset (a la .schema for SQL)
Each variable has:
1. Variable name, storage type (int, byte, long, float, double etc.)
2. Display format
3. Value label (key-value pairs that store numerical values but display text)
4. Variable label (a longer description of the variable e.g. specifying units)

In [None]:
%%stata
* Show the structure of the dataset currently in memory: one line per variable
* with its storage type, display format, value label and variable label
describe

## Show any Notes attached to the Dataset

In [None]:
%%stata
* Print any notes stored with the dataset currently in memory
notes

## Show Variable Name, Label, and some Summary Statistics
Shows whether the values are all unique, contain blanks, or include missing values ("." or the extended missing values ".a" through ".z")<br>
If values are unique, the variable can possibly be used as an identifier<br>
If the variable is an <i>indicator</i> variable, this function will reveal key-value pairs<br>
Indicator variables use less memory and can be worked into statistical models

In [None]:
%%stata
* Detailed report for the single variable bwt
codebook bwt

## Creating and Deleting Variables

In [None]:
%%stata

* Create a derived variable, then delete it again
gen bwt_mg = bwt*1000
drop bwt_mg

* Birth weight divided by 1000, stored with a descriptive variable label
gen bwt_kg = bwt/1000
label variable bwt_kg "birth weight in kilograms"

* "drop if" deletes observations; it can be used to remove outliers in data
drop if bwt_kg < 2

## Dichotomise (or Stratify) Data

In [None]:
%%stata

* Create the variable bwt_strat and initialize it with missing values
generate bwt_strat = .
* Values in the low group (bwt_kg below 3) become 0
replace bwt_strat = 0 if bwt_kg < 3
* Values in the high group become 1; the "!= ." test leaves missing weights unclassified
replace bwt_strat = 1 if bwt_kg != . & bwt_kg >= 3

## Recode Data into Multiple Groups
Specify each category or bin as a rule in parentheses, e.g. "(2/2.5=1)" recodes values from 2 to 2.5 as 1

In [None]:
%%stata

* Bin bwt_kg into the new variable bwt_500g_splits; gen() leaves bwt_kg itself unchanged.
* Each (range=code) pair is one rule, and "max" stands for the largest value of bwt_kg.
* NOTE(review): neighbouring ranges share their endpoints (2.5, 3 and 3.5). Stata's
* recode documentation says the first matching rule wins, so a shared endpoint
* should land in the lower bin - confirm this is the intended split.
recode bwt_kg (0=0) (2/2.5=1) (2.5/3=2) (3/3.5=3) (3.5/max=4), gen(bwt_500g_splits)