# Imports

In [65]:
import pandas as pd
import numpy as np
import torch
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from ast import literal_eval
import matplotlib.pyplot as plt
from textstat.textstat import textstat
from gensim.corpora import wikicorpus
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import nltk
from collections import defaultdict
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_extraction.text import HashingVectorizer
%matplotlib inline
# Make it pretty
plt.style.use('ggplot')

# Import file

In [3]:
file = '../data/enwiki.observations.text_wp10.30k.tsv'
raw_data = pd.read_csv(file, sep='\t', header=None)
data = pd.DataFrame(data=list(raw_data[0].apply(literal_eval)))

# Drop all rows with incorrect labels

In [5]:
data = data[data['text'] != ""]
data = data[data['text'].str.contains("#redirect") == False]
data = data[data['text'].str.contains("may refer to:\n\n*") == False]
data = data[data['text'].str.contains("can refer to:\n") == False]
data = data[data['text'].str.contains("could refer to:\n") == False]
data = data[data['text'].str.contains("#REDIRECT") == False]
data = data[data['text'].str.contains("== Matches ==\n:") == False]
data = data[data['text'].str.contains("{{underconstruction") == False]

# Make y's numeric

In [6]:
classes = {"stub": 0, "start": 1, "c": 2, "b": 3, "ga": 4, "fa": 5} 
data["label"] = data['label'].map(classes)

# Feature Engineering Functions

In [7]:
def clean_wiki_markup(raw_article):
    semi_cleaned_article = wikicorpus.filter_wiki(raw_article)
    cleaned_article = semi_cleaned_article.replace("\n", "").replace("\'", "").replace("()", "").replace("=", "").replace("|alt","").replace("\xa0","")
    return cleaned_article
def find_num_categories(raw_article):
    return raw_article.count("[[Category:")
def find_num_images(raw_article):
    return raw_article.count("[[Image:")
def find_num_ISBN(raw_article):
    return raw_article.count("ISBN")
def find_num_references(raw_article):
    return raw_article.count("</ref>")
def find_article_length(cleaned_article):
    return len(cleaned_article)
def find_num_difficult_words(cleaned_article):
    return textstat.difficult_words(cleaned_article)
def find_dale_chall_readability_score(cleaned_article):
    return textstat.dale_chall_readability_score(cleaned_article)
def find_automated_readability_index(cleaned_article):
    return textstat.automated_readability_index(cleaned_article)
def find_linsear_write_formula(cleaned_article):
    return textstat.linsear_write_formula(cleaned_article)
def find_gunning_fog_index(cleaned_article):
    return textstat.gunning_fog(cleaned_article)
def find_syllable_count(cleaned_article):
    return textstat.syllable_count(cleaned_article)
def find_lexicon_count(cleaned_article):
    return textstat.lexicon_count(cleaned_article, removepunct=True)
def find_sentence_count(cleaned_article):
    return textstat.sentence_count(cleaned_article)
def find_smog_index(cleaned_article):
    return textstat.smog_index(cleaned_article)
def find_num_web_citations(raw_article):
    return raw_article.count("{{cite web")
def find_num_book_citations(raw_article):
    return raw_article.count("{{cite book")
def find_num_news_citations(raw_article):
    return raw_article.count("{{cite news")
def find_num_quotes(raw_article):
    return raw_article.count("quote=")
def find_num_h3_headers(raw_article):
    return raw_article.count("\n===")
def find_num_internal_links(raw_article):
    return (raw_article.count("[[") // 2)
def find_num_h2_headers(raw_article):
    return (raw_article.count("\n==") - find_num_h3_headers(raw_article))
def find_num_note_tags(raw_article):
    return raw_article.count("{{note")
def find_num_bullet_points(raw_article):
    return (raw_article.count("*"))
def find_num_underlines(raw_article):
    return (raw_article.count("<u>"))
def find_num_journal_citations(raw_article):
    return (raw_article.count("{{cite journal"))
def find_num_about_links(raw_article):
    return (raw_article.count("{{About"))
def find_num_wikitables(raw_article):
    return (raw_article.count('class="wikitable'))
def find_num_footnotes(raw_article):
    return raw_article.count("{{")
def find_infobox(raw_article):
    return int('{{Infobox' in raw_article)

In [8]:
data['cleaned_text'] = data['text'].apply(clean_wiki_markup)
data['num_web_citations'] = data['text'].apply(find_num_web_citations)
data['num_book_citations'] = data['text'].apply(find_num_book_citations)
data['num_news_citations'] = data['text'].apply(find_num_news_citations)
data['num_quotes'] = data['text'].apply(find_num_quotes)
data['num_h3_headers'] = data['text'].apply(find_num_h3_headers)
data['num_internal_links'] = data['text'].apply(find_num_internal_links)
data['num_h2_headers'] = data['text'].apply(find_num_h2_headers)
data['has_infobox'] = data['text'].str.contains('{{Infobox').astype(int)
data['num_categories'] = data['text'].apply(find_num_categories)
data['num_images'] = data['text'].apply(find_num_images)
data['num_ISBN'] = data['text'].apply(find_num_ISBN)
data['num_references'] = data['text'].apply(find_num_references)
data['article_length'] = data['text'].apply(find_article_length)
data['num_difficult_words'] = data['cleaned_text'].apply(find_num_difficult_words)
data['dale_chall_readability_score'] = data['cleaned_text'].apply(find_dale_chall_readability_score)
data['readability_index'] = data['cleaned_text'].apply(find_automated_readability_index)
data['linsear_write_formula'] = data['cleaned_text'].apply(find_linsear_write_formula)
data['gunning_fog_index'] = data['cleaned_text'].apply(find_gunning_fog_index)
data['smog_index'] = data['cleaned_text'].apply(find_smog_index)
data['syllable_count'] = data['cleaned_text'].apply(find_syllable_count)
data['lexicon_count'] = data['cleaned_text'].apply(find_lexicon_count)
data['sentence_count'] = data['cleaned_text'].apply(find_sentence_count)
data['num_footnotes'] = data['text'].apply(find_num_footnotes)
data['num_note_tags'] = data['text'].apply(find_num_note_tags)
data['num_underlines'] = data['text'].apply(find_num_underlines)
data['num_journal_citations'] = data['text'].apply(find_num_journal_citations)
data['num_about_links'] = data['text'].apply(find_num_about_links)
data['num_wikitables'] = data['text'].apply(find_num_wikitables)

Error(DCRS): Word Count is zero cannot divide
Error(DCRS): Word Count is zero cannot divide
Error(DCRS): Word Count is zero cannot divide
Error(DCRS): Word Count is zero cannot divide
Error(DCRS): Word Count is zero cannot divide
Error(DCRS): Word Count is zero cannot divide
Error(DCRS): Word Count is zero cannot divide
Error(DCRS): Word Count is zero cannot divide
Error(DCRS): Word Count is zero cannot divide
Error(ARI) : Sentence count is zero, cannot divide
Error(ARI) : Sentence count is zero, cannot divide
Error(ARI) : Sentence count is zero, cannot divide
Error(ARI) : Sentence count is zero, cannot divide
Error(ARI) : Sentence count is zero, cannot divide
Error(ARI) : Sentence count is zero, cannot divide
Error(ARI) : Sentence count is zero, cannot divide
Error(ARI) : Sentence count is zero, cannot divide
Error(ARI) : Sentence count is zero, cannot divide
Error(GF): Word Count is Zero, cannot divide
Error(GF): Word Count is Zero, cannot divide
Error(GF): Word Count is Zero, cannot

# Save DataFrame

In [None]:
data.to_csv('wiki_train.tsv', sep='\t')

In [161]:
backup_data = data

In [162]:
backup_data.dropna(inplace=True)

In [165]:
random_forest_data = backup_data.loc[:, ['label', 'has_infobox','num_categories','num_images','num_ISBN','num_references','article_length',
                'num_difficult_words','dale_chall_readability_score','readability_index','linsear_write_formula',
                'gunning_fog_index', 'num_web_citations','num_book_citations','num_news_citations',
                'num_quotes','num_h3_headers','num_internal_links', 'num_h2_headers', 'syllable_count',
                'lexicon_count', 'sentence_count','num_footnotes', 'num_note_tags', 'num_underlines', 'num_journal_citations',
                'num_about_links', 'num_wikitables', 'smog_index']]

In [166]:
random_forest_data.to_csv('random_forest_data.csv')

# Train first two models

In [35]:
import pickle

In [164]:
print(data.shape)
print(backup_data.shape)

(29344, 34)
(29344, 34)


# Train / Test split

In [167]:
feature_engineered_X = backup_data.loc[:, ['has_infobox','num_categories','num_images','num_ISBN','num_references','article_length',
                'num_difficult_words','dale_chall_readability_score','readability_index','linsear_write_formula',
                'gunning_fog_index', 'num_web_citations','num_book_citations','num_news_citations',
                'num_quotes','num_h3_headers','num_internal_links', 'num_h2_headers', 'syllable_count',
                'lexicon_count', 'sentence_count','num_footnotes', 'num_note_tags', 'num_underlines', 'num_journal_citations',
                'num_about_links', 'num_wikitables', 'smog_index']].values
y = backup_data['label'].values

(29344,)

# 1) Random Forest w/ hand engineered features

In [170]:
X_train, X_test, y_train, y_test = train_test_split(feature_engineered_X, y, test_size=0.20, random_state=910)

Using Scikit-Learn’s RandomizedSearchCV method, we can define a grid of hyperparameter ranges, and randomly sample from the grid, performing K-Fold CV with each combination of values.

The most important settings are the number of trees in the forest **(n_estimators)** and the number of features considered for splitting at each leaf node **(max_features)**.

We will try adjusting the following set of hyperparameters:
- n_estimators = number of trees in the foreset
- max_features = max number of features considered for splitting a node
- max_depth = max number of levels in each decision tree
- min_samples_split = min number of data points placed in a node before the node is split
- min_samples_leaf = min number of data points allowed in a leaf node
- bootstrap = method for sampling data points (with or without replacement)

In [171]:
from sklearn.model_selection import RandomizedSearchCV

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 20, stop = 100, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 50, num = 5)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [10, 50, 100]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

In [177]:
# Use the random grid to search for best hyperparameters
# First create the base model to tune
rf = RandomForestRegressor()
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 10, verbose=0, random_state=42, n_jobs = -1)
# Fit the random search model
rf_random.fit(X_train, y_train)

RandomizedSearchCV(cv=10, error_score='raise',
          estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False),
          fit_params=None, iid=True, n_iter=100, n_jobs=-1,
          param_distributions={'n_estimators': [20, 28, 37, 46, 55, 64, 73, 82, 91, 100], 'max_features': ['auto', 'sqrt'], 'max_depth': [10, 20, 30, 40, 50, None], 'min_samples_split': [10, 50, 100], 'min_samples_leaf': [1, 2, 4], 'bootstrap': [True, False]},
          pre_dispatch='2*n_jobs', random_state=42, refit=True,
          return_train_score='warn', scoring=None, verbose=0)

In [178]:
def evaluate(model, test_features, test_labels):
    predictions = model.predict(test_features)
    errors = abs(predictions - test_labels)
    mape = 100 * np.mean(errors / test_labels)
    accuracy = 100 - mape
    print('Model Performance')
    print('Average Error: {:0.4f} degrees.'.format(np.mean(errors)))
    print('Accuracy = {:0.2f}%.'.format(accuracy))
    
    return accuracy
base_model = RandomForestRegressor(n_estimators = 10, random_state = 910)
base_model.fit(X_train, y_train)
base_accuracy = evaluate(base_model, X_train, y_train)

best_random = rf_random.best_estimator_
random_accuracy = evaluate(best_random, X_train, y_train)

print('Improvement of {:0.2f}%.'.format( 100 * (random_accuracy - base_accuracy) / base_accuracy))

  after removing the cwd from sys.path.
  after removing the cwd from sys.path.


Model Performance
Average Error: 0.2539 degrees.
Accuracy = nan%.
Model Performance
Average Error: 0.2106 degrees.
Accuracy = nan%.
Improvement of nan%.


# Save Random Forest!!!!

In [183]:
filename = 'random_forest_model.sav'
pickle.dump(best_random, open(filename, 'wb'))

In [182]:
type(best_random)

sklearn.ensemble.forest.RandomForestRegressor

In [181]:
print(mean_squared_error(y_test, preds))

0.6744901996151536


# 2) Random Forest w/ hash vectors

In [66]:
hash_vectorizer = HashingVectorizer(n_features=5000)

In [67]:
hash_X_train, hash_X_test, hash_y_train, hash_y_test = train_test_split(hash_vec_X, y, test_size=0.20, random_state=910)

In [68]:
hash_vectorizer.fit(hash_X_train)

HashingVectorizer(alternate_sign=True, analyzer='word', binary=False,
         decode_error='strict', dtype=<class 'numpy.float64'>,
         encoding='utf-8', input='content', lowercase=True,
         n_features=5000, ngram_range=(1, 1), non_negative=False,
         norm='l2', preprocessor=None, stop_words=None, strip_accents=None,
         token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None)

In [72]:
hash_X_transformed = hash_vectorizer.transform(hash_X_train.ravel())

In [73]:
hash_rf = RandomForestRegressor(bootstrap=True, max_depth=20,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=35,
           oob_score=False, random_state=910, verbose=0, warm_start=False)

In [74]:
hash_rf.fit(hash_X_transformed, hash_y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=20,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=35, n_jobs=1,
           oob_score=False, random_state=910, verbose=0, warm_start=False)

In [76]:
hash_X_test_transformed = hash_vectorizer.transform(hash_X_test.ravel())

In [77]:
hash_predictions = hash_rf.predict(hash_X_test_transformed)

In [78]:
print(mean_squared_error(hash_y_test, hash_predictions))

0.9546654979190564


# Ensemble Model

In [79]:
ensemble_data = data[14672:]

In [80]:
feature_engineered_X = ensemble_data.loc[:, ['has_infobox','num_categories','num_images','num_ISBN','num_references','article_length',
                'num_difficult_words','dale_chall_readability_score','readability_index','linsear_write_formula',
                'gunning_fog_index', 'num_web_citations','num_book_citations','num_news_citations',
                'num_quotes','num_h3_headers','num_internal_links', 'num_h2_headers', 'syllable_count',
                'lexicon_count', 'sentence_count','num_footnotes', 'num_note_tags', 'num_underlines', 'num_journal_citations',
                'num_about_links', 'num_wikitables', 'smog_index']].values
hash_vec_X = ensemble_data.loc[:, ['cleaned_text']].values
y = ensemble_data['label'].values

## Find Article Features Random Forest For All Predictions

In [82]:
article_features_rf_preds = article_features_rf.predict(feature_engineered_X)

## Find Hash Random Forest For All Predictions

In [85]:
hash_ensemble_transformed = hash_vectorizer.transform(hash_vec_X.ravel())

In [86]:
hash_ensembe_predictions = hash_rf.predict(hash_ensemble_transformed)

In [92]:
ensemble_data['feature_rf_preds'] = article_features_rf_preds

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [90]:
ensemble_data['hash_rf_preds'] = hash_ensembe_predictions

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [97]:
ensemble_X = ensemble_data.loc[:, ['feature_rf_preds', 'hash_rf_preds']].values

In [101]:
ensemble_y = ensemble_data['label'].values

# Ensemble Model: Linear Regression

## Train / Test Split

In [103]:
ensemble_X_train, ensemble_X_test, ensemble_y_train, ensemble_y_test = train_test_split(ensemble_X, ensemble_y, test_size=0.20, random_state=910)

In [104]:
from sklearn import linear_model

In [105]:
ensemble_linear_model = linear_model.LinearRegression()

In [106]:
ensemble_linear_model.fit(ensemble_X_train, ensemble_y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [107]:
ensemble_preds = ensemble_linear_model.predict(ensemble_X_test)

In [159]:
print(mean_squared_error(ensemble_X_train[:, 0], ensemble_y_train))
print(mean_squared_error(ensemble_X_train[:, 1], ensemble_y_train))

0.7605524886016601
0.9627956012855443


In [156]:
ensemble_X_train.shape

(11737, 2)

In [152]:
ensemble_preds_train = ensemble_linear_model.predict(ensemble_X_train)

In [153]:
print(mean_squared_error(ensemble_y_train, ensemble_preds_train))

0.7467782399791958


In [108]:
print(mean_squared_error(ensemble_y_test, ensemble_preds))

0.8041753967782823


In [112]:
ensemble_data['acv_preds'] = ((ensemble_data['feature_rf_preds'] + ensemble_data['hash_rf_preds']) / 2)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [114]:
print(mean_squared_error(ensemble_data['label'].values, ensemble_data['acv_preds'].values))

0.7911875856457949


In [117]:
why = ensemble_data.loc[:, ['label','feature_rf_preds', 'hash_rf_preds']]

In [118]:
why

Unnamed: 0,label,feature_rf_preds,hash_rf_preds
14738,2,1.755046,2.147767
14739,4,3.399567,3.525983
14740,5,4.270000,2.804943
14741,5,4.302981,3.752219
14742,4,2.714286,3.157665
14743,1,1.423449,1.003858
14744,5,4.609352,3.303246
14745,2,3.662006,3.350708
14746,4,3.646154,3.706150
14747,5,3.400000,3.586730


In [119]:
ensemble_linear_model.coef_

array([0.8153928 , 0.21688293])

In [123]:
print(backup_data['text'][0])

[[Image:GD-FR-Paris-Louvre-Sculptures034.JPG|320px|thumb|Tomb of Philippe Pot, governor of [[Burgundy (region)|Burgundy]]  under [[Louis XI]]|alt=A large sculpture of six life-sized black-cloaked men, their faces obscured by their hoods, carrying a slab upon which lies the supine effigy of a knight, with hands folded together in prayer. His head rests on a pillow, and his feet on a small reclining lion.]]
[[File:Sejong tomb 1.jpg|thumb|320px|Korean tomb mound of King [[Sejong the Great]], d. 1450]]
[[Image:Istanbul - Süleymaniye camii - Türbe di Roxellana - Foto G. Dall'Orto 28-5-2006.jpg|thumb|320px|[[Türbe]] of [[Roxelana]] (d. 1558), [[Süleymaniye Mosque]], [[Istanbul]]]]
'''Funerary art''' is any work of [[art]] forming, or placed in, a repository for the remains of the [[death|dead]]. [[Tomb]] is a general term for the repository, while [[grave goods]] are objects—other than the primary human remains—which have been placed inside.<ref>Hammond, 58–9 characterizes [[Dismemberment|di

In [136]:
%pdb
for i in range(len(backup_data)):
    backup_data.loc[i,:].to_csv('wiki_train.tsv', sep='\t', encoding='utf-8')

Automatic pdb calling has been turned ON


UnicodeEncodeError: 'utf-8' codec can't encode characters in position 67334-67353: surrogates not allowed

> [0;32m/Users/austin/Documents/Galvanize/Capstone/Wikipedia_Knowledge_Graph/src/pandas/_libs/writers.pyx[0m(84)[0;36mpandas._libs.writers.write_csv_rows[0;34m()[0m

ipdb> u
> [0;32m/Users/austin/anaconda3/lib/python3.6/site-packages/pandas/io/formats/csvs.py[0m(313)[0;36m_save_chunk[0;34m()[0m
[0;32m    309 [0;31m                                        [0mdate_format[0m[0;34m=[0m[0mself[0m[0;34m.[0m[0mdate_format[0m[0;34m,[0m[0;34m[0m[0m
[0m[0;32m    310 [0;31m                                        quoting=self.quoting)
[0m[0;32m    311 [0;31m[0;34m[0m[0m
[0m[0;32m    312 [0;31m        libwriters.write_csv_rows(self.data, ix, self.nlevels,
[0m[0;32m--> 313 [0;31m                                  self.cols, self.writer)
[0m
ipdb> u
> [0;32m/Users/austin/anaconda3/lib/python3.6/site-packages/pandas/io/formats/csvs.py[0m(286)[0;36m_save[0;34m()[0m
[0;32m    284 [0;31m                [0;32mbreak[0m[0;34m[0m[0m
[0m[0;32m    285 [0;

In [142]:
backup_data.loc[82,'cleaned_text'][67334:67353]

''

In [139]:
'י'.encode()

b'\xd7\x99'

In [144]:
backup_data.to_pickle('backup_data.pkl')

In [150]:
backup_data.to_parquet('backup_data.parq')

RuntimeError: Compression 'snappy' not available.  Options: ['GZIP', 'UNCOMPRESSED']

> [0;32m/Users/austin/anaconda3/lib/python3.6/site-packages/fastparquet/compression.py[0m(131)[0;36mcompress_data[0;34m()[0m
[0;32m    129 [0;31m    [0;32mif[0m [0malgorithm[0m[0;34m.[0m[0mupper[0m[0;34m([0m[0;34m)[0m [0;32mnot[0m [0;32min[0m [0mcompressions[0m[0;34m:[0m[0;34m[0m[0m
[0m[0;32m    130 [0;31m        raise RuntimeError("Compression '%s' not available.  Options: %s" %
[0m[0;32m--> 131 [0;31m                (algorithm, sorted(compressions)))
[0m[0;32m    132 [0;31m    [0;32mif[0m [0margs[0m [0;32mis[0m [0;32mNone[0m[0;34m:[0m[0;34m[0m[0m
[0m[0;32m    133 [0;31m        [0;32mreturn[0m [0mcompressions[0m[0;34m[[0m[0malgorithm[0m[0;34m.[0m[0mupper[0m[0;34m([0m[0;34m)[0m[0;34m][0m[0;34m([0m[0mdata[0m[0;34m)[0m[0;34m[0m[0m
[0m
ipdb> q


In [146]:
!pip install fastparquet

Collecting fastparquet
[?25l  Downloading https://files.pythonhosted.org/packages/46/b2/ad083ff3873384b86c180b0f88e3a8f6f097aba8f48a77cadbc24806b395/fastparquet-0.1.6-cp36-cp36m-macosx_10_7_x86_64.whl (174kB)
[K    100% |████████████████████████████████| 184kB 2.7MB/s ta 0:00:01
[?25hCollecting thrift>=0.11.0 (from fastparquet)
[?25l  Downloading https://files.pythonhosted.org/packages/c6/b4/510617906f8e0c5660e7d96fbc5585113f83ad547a3989b80297ac72a74c/thrift-0.11.0.tar.gz (52kB)
[K    100% |████████████████████████████████| 61kB 4.6MB/s ta 0:00:01
Collecting pytest-runner (from fastparquet)
  Downloading https://files.pythonhosted.org/packages/72/a4/d7a5738a3096f22a98bec1609e237b250ebff04e5ea2930305d485337263/pytest_runner-4.2-py2.py3-none-any.whl
Building wheels for collected packages: thrift
  Running setup.py bdist_wheel for thrift ... [?25ldone
[?25h  Stored in directory: /Users/austin/Library/Caches/pip/wheels/be/36/81/0f93ba89a1cb7887c91937948519840a72c0ffdd57cac0ae8f
Succ

In [149]:
! yes | conda install python-snappy

Solving environment: done

## Package Plan ##

  environment location: /Users/austin/anaconda3

  added / updated specs: 
    - python-snappy


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    conda-4.5.10               |           py36_0         1.0 MB
    python-snappy-0.5.2        |   py36h0a44026_0          27 KB
    openssl-1.0.2p             |       h1de35cc_0         3.4 MB
    ca-certificates-2018.03.07 |                0         124 KB
    certifi-2018.8.13          |           py36_0         138 KB
    ------------------------------------------------------------
                                           Total:         4.7 MB

The following NEW packages will be INSTALLED:

    python-snappy:   0.5.2-py36h0a44026_0         

The following packages will be UPDATED:

    ca-certificates: 2018.03.07-0         anaconda --> 2018.03.07-0     
    certifi:         2018.8.13-py36_0     a

In [151]:
!du -m backup_data.pkl

1035	backup_data.pkl
