# Index

- [Data Backups](#data-backups)
- [Regex](#regex)
- [Pandas](#pandas)
- [Web parsing](#web-parsing)
- [Visualisation](#visualisation)
- [Pyspark](#pyspark)
- [Applied ML](#applied-ml)
- [Unsupervised Learning](#unsupervised-learning)
- [Natural Language Processing](#natural-language-processing)
- [Statistics](#statistics)
- [Graph Networks](#graph-networks)


# Importing Libraries

In [None]:
# Data Manipulation Libraries
import pickle
import pandas as pd
import numpy as np
from numpy.random import seed as random_seed
from numpy.random import shuffle as random_shuffle
import math
from collections import Counter
from os import listdir
import os, codecs, string, random
from numpy.random import randint

# Visualisation Libraries
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from datetime import datetime, date, time
from dateutil.parser import parse
from pandas.plotting import scatter_matrix
from tqdm.notebook import tqdm

# Web parsing Libraries
import requests
from bs4 import BeautifulSoup

# Big Data Libraries
import findspark
findspark.init()
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql import *
from pyspark.sql.functions import *
from pyspark import SparkContext

# Machine Learning Libraries
from sklearn.linear_model import LinearRegression, LogisticRegression, Ridge
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import r2_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from scipy.spatial.distance import euclidean
from scipy.spatial.distance import cosine
from scipy.spatial.distance import jaccard
from sklearn.cluster import KMeans
from sklearn.cluster import DBSCAN
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import LogisticRegressionCV

# NLP Libraries
import spacy, nltk, gensim, sklearn
import pyLDAvis.gensim
import vaderSentiment
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.utils import shuffle
from gensim.models.phrases import Phrases
import re

# Statistics
from scipy.stats import ttest_ind
from scipy.stats import wilcoxon
from scipy.stats import kstest
from scipy.stats import pearsonr
from sklearn.feature_selection import mutual_info_classif
import itertools

# Graphs
import networkx as nx
from operator import itemgetter
from community import community_louvain
import collections
from networkx.algorithms.community.centrality import girvan_newman

In [None]:
# Concatenating rows of a column that each are a list
# For every row, build a Series indexed by the words of 'Line' whose values are
# that row's 'Character'; concat + reset_index then gives one row per word,
# with the word in the former-index column and the character next to it.
words_for_chars = pd.concat([pd.Series(row["Character"], row['Line'].split())
                             for _, row in train_set.iterrows()]).reset_index()

# Data backups
Don't forget to create a `backup` folder next to the notebook: the helpers below read and write `./backup/data_<index>.pkl`.

In [None]:
def read_data(index):
    """Load the object stored in the backup file for ``index``.

    Args:
        index: Identifier used in the file name ``./backup/data_<index>.pkl``.
            Strings are used as-is; any other value (e.g. an int counter) is
            converted with ``str()``.

    Returns:
        The unpickled object.

    Raises:
        FileNotFoundError: If no backup file exists for ``index``.
    """
    file = './backup/data_' + str(index) + '.pkl'
    # NOTE: pickle.load can execute arbitrary code while loading; only read
    # backup files that you created yourself.
    with open(file, 'rb') as fp:
        return pickle.load(fp)

In [None]:
def save_data(data, index):
    """Pickle ``data`` into the backup file for ``index``.

    The ``./backup`` folder is created if it does not exist yet, so the first
    milestone no longer fails with FileNotFoundError.

    Args:
        data: Any picklable object.
        index: Identifier used in the file name ``./backup/data_<index>.pkl``.
            Strings are used as-is; any other value (e.g. an int counter) is
            converted with ``str()``.
    """
    folder = './backup'
    os.makedirs(folder, exist_ok=True)
    file = folder + '/data_' + str(index) + '.pkl'
    with open(file, 'wb') as fp:
        pickle.dump(data, fp)

In [None]:
# Milestone
# Bump the running backup counter and pickle the current `bbt` object under it
# (assumes `index` is an int initialised earlier in the notebook — TODO confirm).
index += 1
save_data(bbt, str(index))
print(index)

# Regex

In [None]:
# Find word in text
def word_in_text(words, text):
    """Return True if at least one of the keywords occurs in ``text``.

    Matching is case-insensitive and tolerant to whitespace: the spaces inside
    a multi-word keyword match any amount of whitespace (including none), so
    'open access' matches 'Open  Access' and 'openaccess'.

    Args:
        words: Iterable of keyword strings.
        text: String to search in.

    Returns:
        bool: True if any keyword is found, False otherwise (also False when
        ``words`` is empty).
    """
    # Escape each keyword part so regex metacharacters are matched literally,
    # and join the parts with optional whitespace.
    patterns = [r'\s*'.join(re.escape(part) for part in word.split())
                for word in words]
    patterns = [pattern for pattern in patterns if pattern]
    if not patterns:
        # An empty alternation would match every text.
        return False
    return re.search('|'.join(patterns), text, flags=re.IGNORECASE) is not None

In [None]:
# delete substring until certain character (the character itself is kept)
def delete_until(x, char=':'):
    """Drop everything in ``x`` that precedes the first occurrence of ``char``.

    Usable directly with pandas, e.g. ``df['col'].apply(delete_until)``.

    Args:
        x: Input string.
        char: Marker to cut at; it is kept in the result.

    Returns:
        ``x`` starting at the first ``char``, or ``x`` unchanged when ``char``
        does not occur.
    """
    position = x.find(char)
    # str.find returns -1 when char is absent; slicing from -1 would keep only
    # the last character of the string.
    return x[position:] if position != -1 else x

<span style="color:royalblue">    
    Find all lines containing a word <br>
    Counting tweets containing a word
</span>

In [None]:
# actually apply the regexp
keywords = ['open access', 'open science', 'ipython', 'open data', 'reproducible research','epfl']
for w in keywords:
    # one boolean column per keyword: does the tweet mention it?
    tweets[w] = tweets['text'].apply(lambda tweet: word_in_text([w], tweet))

# Number of tweets per keyword. Summing the boolean column counts the True
# values and, unlike value_counts()[True], does not raise KeyError when no
# tweet contains the keyword.
tweets_by_kw = pd.Series([int(tweets[w].sum()) for w in keywords], index=keywords)

# Pandas

In [None]:
# Groupby function
df.groupby('A').apply(lambda x: x.sum())

<span style="color:royalblue">    
    Standardise times
</span>

In [None]:
# Creating date
datetime(1970, 1, 1) # arguments are (year, month, day)

# Parsing an entire column of date
# strptime format: month/day/2-digit year, then 24h hour:minute (e.g. '02/13/19 14:05')
df.column.apply(lambda d: datetime.strptime(d, '%m/%d/%y %H:%M')).head(10)
# Filter per month
# (the .dt accessor needs the column to already hold datetimes, not strings)
df[df.column.dt.month==2].head()

<span style="color:royalblue">
Rotates / reshapes DataFrames (wide ⇄ long).


**stack** — Moves column labels into a lower-level row index (wide → long).
Example: `df.stack()` turns columns into an inner index and typically returns a Series.

**unstack** — Inverse of `stack`: moves an index level into columns (long → wide).
Example: If `s = df.stack()`, then `s.unstack()` restores the original `df` (when pairs are unique).

**pivot** — Reshapes long-form columns into a wide table by specifying `index`, `columns`, and `values`.
Requires unique (`index`,`columns`) pairs; otherwise use `pivot_table` with an aggregation.
Example: `df.pivot(index='date', columns='var', values='val')` creates one column per `var`.
</span>

In [None]:
df.stack() # Column labels become an inner row-index level (wide -> long)
df.unstack() # Inverse operation: moves the inner index level into columns

# NOTE: the trailing .head() keeps only the first 5 rows of the pivoted table
df_wide = df.pivot(index='id_column', columns='column to pivot', values='twstrs').head()

# one output row per element of the list-like values in `column`
df.explode(column=column)

# crosstab for contingency tables
pd.crosstab(df['col1'], df['col2'], margins=True)

# merge / join examples
pd.merge(left_df, right_df, how='left', on='key')
left_df.join(right_df.set_index('key'), on='key', how='inner')

# concat multiple dataframes (stack vertically/horizontally)
pd.concat([df1, df2], axis=0, ignore_index=True)
pd.concat([df1, df2], axis=1)

# hstack: append extra feature columns to an existing feature matrix
X2 = np.hstack((X, merged_df_[["PP", "NN", "PN", "N", "P"]].values))

# reshape long -> wide / wide -> long
df.melt(id_vars=['id'], value_vars=['A','B'], var_name='variable', value_name='value')

In [None]:
# Index and renaming
df = df.rename(columns={'Historical Significance': 'Role'})
df.set_index('index_column', inplace=True)

df.set_index(pd.to_datetime(df['date'])).resample('M').sum()
df.reset_index(drop=True, inplace=True)

In [None]:
# Examples: grouping & aggregations
df.groupby('group_col').agg({'val1': ['mean', 'sum', 'sem'], 'val2': 'max'})

# Named aggregations (pandas 0.25+)
df.groupby('group_col').agg( mean_val1=('val1', 'mean'), sum_val2=('val2', 'sum'))

In [None]:
# Group size and count
df.groupby('group_col').size()  # number of rows per group
df.groupby('group_col')['val1'].count()  # number of non-null val1 values per group
# value_counts() returns a Series indexed by the distinct values of 'col', so
# assigning it directly would align on the wrong index and produce NaN.
# Mapping it back through 'col' gives every row the frequency of its own value.
df['count_values'] = df['col'].map(df['col'].value_counts())

In [None]:
# transform vs apply: keep index alignment
df['val1_pct_of_group'] = df.groupby('group_col')['val1'].transform(lambda x: x / x.sum())

In [None]:
# cumsum, rank, standardized within group
df['cum'] = df.groupby('group_col')['val1'].cumsum()
df['rank'] = df.groupby('group_col')['val1'].rank(ascending=False)

# pivot_table with aggregation and fill_value
pd.pivot_table(df, index='date', columns='category', values='value', aggfunc='mean', fill_value=0)

In [None]:
# useful selectors
df.query("colA > 10 and colB == 'X'")
df[df['col'].isin(['a','b','c'])]

df.nlargest(5, 'score')
df.nsmallest(5, 'score')

In [None]:
# time series / rolling windows
df['rolling_mean'] = df['value'].rolling(window=7, min_periods=1).mean()
df['ewm'] = df['value'].ewm(span=7).mean() # exponential moving window

In [None]:
# duplicates
df['is_dup'] = df.duplicated(subset=['col1','col2'])
df.drop_duplicates(subset=['col1','col2'], keep='first')

# Web parsing

In [None]:
# GET request; the response must be stored in `r`, which the next lines read
r = requests.get('http://worldtimeapi.org/api/timezone/Europe/Zurich')
r.json()  # body parsed as JSON
r.text[:300]  # first 300 characters of the raw body

In [None]:
payload = {'key1': 'value1', 'key2': 'value2'}
r = requests.post('https://httpbin.org/post', data=payload)
r.json()

In [None]:
soup = BeautifulSoup(r.text, 'html.parser')
soup.h1
soup.title.string
all_links = soup.find_all('a')

# Count links that point outside dblp
external_links = 0
for link in all_links:
    # <a> tags without an href attribute return None; treat them as empty
    href = link.get('href') or ''
    if (href.startswith('http')
            and not href.startswith('http://dblp.uni-trier.de/')):  # just an example, you need more checks
        external_links += 1

publications_wrappers = soup.find_all('li', class_='entry')

# Visualisation

In [None]:
# Log-log plot - Log axis with pyplot
plt.xscale('log')
plt.yscale('log')

In [None]:
# Scattering plots to observe relationships between arguments
df.plot(kind='scatter', x='newspaper', y='sales', ax=axs[2], grid=True)

In [None]:
# Histograms / density
plt.hist(data, bins=30, color='skyblue', edgecolor='k')
plt.xlabel('value')
plt.ylabel('count')

In [None]:
fig, ax = plt.subplots(figsize=(8,4))
ax.plot(x, y, label='line')
ax.scatter(x, y, c='C1', alpha=0.7, label='points')
ax.set_xlabel('x')
ax.set_ylabel('y')
ax.set_title('Line + Scatter')
ax.legend()
plt.grid(True)

In [None]:
# Bar plot (categorical counts)
counts = df['cat'].value_counts()
counts.plot(kind='bar', color='coral')

In [None]:
# Boxplot / violinplot (with seaborn)
sns.set_theme(style='whitegrid')
sns.boxplot(x='category', y='value', data=df)
sns.violinplot(x='category', y='value', data=df)

In [None]:
# KDE / hist with seaborn
sns.histplot(df['value'], kde=True, bins=40)
# `fill=True` replaces the deprecated `shade=True` (available since seaborn 0.11,
# the same release that introduced histplot)
sns.kdeplot(df['value'], fill=True)

# Countplot / barplot
sns.countplot(x='category', data=df)
# NOTE(review): seaborn >= 0.12 deprecates `ci=` in favour of errorbar=('ci', 95);
# switch once the installed version is confirmed.
sns.barplot(x='category', y='value', data=df, ci=95)

In [None]:
# Scatter with regression line
sns.regplot(x='xcol', y='ycol', data=df, scatter_kws={'s':10}, line_kws={'color':'red'})

# ECDF (used in exams)
sns.ecdfplot(data=df['degree'])

In [None]:
# Pairplot / jointplot / heatmap
sns.pairplot(df[['a','b','c']])
sns.jointplot(x='a', y='b', data=df, kind='hex')
sns.heatmap(df.corr(), annot=True, fmt='.2f', cmap='vlag')

In [None]:
# Subplots grid
fig, axs = plt.subplots(1, 3, figsize=(12,3))
axs[0].plot(x, y1)
axs[1].plot(x, y2)
axs[2].plot(x, y3)
for ax in axs:
    ax.grid(True)
plt.tight_layout()

In [None]:
# Annotations and reference lines
plt.axvline(0, color='k', linestyle='--')
plt.axhline(0.5, color='gray', linestyle=':')
ax.annotate('peak', xy=(x[idx], y[idx]), xytext=(x[idx]+1, y[idx]+1),
            arrowprops=dict(arrowstyle='->'))

In [None]:
# Colorbar for imshow/heatmap
im = ax.imshow(matrix, cmap='viridis')
fig.colorbar(im, ax=ax)

## Error Bars (always show them)

In [None]:
# Comprehensive error-bar examples
import numpy as np
from scipy.stats import sem
import matplotlib.pyplot as plt
import seaborn as sns

# 1) Simple errorbar (symmetric)
x = np.arange(5)
y = np.array([3., 5., 2., 8., 7.])
yerr = np.array([0.5, 0.8, 0.2, 1.0, 0.6])
plt.figure(figsize=(6,3))
plt.errorbar(x, y, yerr=yerr, fmt='o-', color='C0',
             ecolor='gray', elinewidth=2, capsize=4, capthick=1,
             label='mean ± SE')
plt.xlabel('x')
plt.ylabel('y')
plt.legend()
plt.grid(True)
plt.show()

# 2) Bar plot with error bars (means + SEM)
labels = ['A','B','C']
means = np.array([2.3, 3.1, 4.0])
sems = np.array([0.2, 0.25, 0.15])
plt.figure(figsize=(5,3))
plt.bar(labels, means, yerr=sems, capsize=6, color='skyblue', edgecolor='k')
plt.ylabel('value')
plt.title('Mean ± SEM')
plt.show()

# 3) Asymmetric errors (lower, upper)
lower = np.array([0.4,0.2,0.6])
upper = np.array([0.8,0.3,0.4])
asym_err = [lower, upper]
plt.figure()
plt.errorbar([0,1,2], means, yerr=asym_err, fmt='o', capsize=5)
plt.show()

# 4) Compute mean ± 95% CI from raw data (pandas groupby example)
# assumes `df` with columns 'group' and 'value'
# from scipy.stats import sem  # already imported above
grouped = df.groupby('group')['value']
means = grouped.mean()
sems = grouped.apply(lambda x: sem(x, nan_policy='omit'))
ci95 = sems * 1.96
ax = means.plot(kind='bar', yerr=ci95, capsize=5, rot=0)
ax.set_ylabel('mean value')
plt.title('Group means with 95% CI')
plt.show()

# 5) Seaborn helpers (point estimates + CI)
# pointplot shows mean with CI (default 95%)
sns.pointplot(data=df, x='group', y='value', capsize=.1)
# barplot also supports ci (use ci=None to turn off)
sns.barplot(data=df, x='group', y='value', ci=95, capsize=.1)
plt.show()

# 6) Error band for time series (rolling +/- std)
# assumes `ts` is a DataFrame or Series with a DatetimeIndex and column 'value'
rolling_mean = ts['value'].rolling(7, min_periods=1).mean()
rolling_std = ts['value'].rolling(7, min_periods=1).std()
plt.figure(figsize=(8,3))
plt.plot(ts.index, rolling_mean, label='7-day mean')
plt.fill_between(ts.index, rolling_mean - rolling_std, rolling_mean + rolling_std,
                 color='C0', alpha=0.2, label='±1 std')
plt.legend()
plt.show()

# Tips and options:
# - use `capsize` to show caps, `ecolor` to style error lines
# - `elinewidth` and `capthick` control thickness
# - pass asymmetric errors as [lower, upper]
# - for large datasets, consider seaborn's aggregators (pointplot/barplot) or bootstrap CIs
# - to show only every nth errorbar, use `errorevery` in plt.errorbar
# Example: plt.errorbar(x, y, yerr=yerr, errorevery=2)
# - to represent CI visually, `fill_between` often looks cleaner than many caps

# Pyspark

In [None]:
# create the session
spark = SparkSession.builder.getOrCreate()

# create the context
sc = spark.sparkContext

In [None]:
# Local Spark configuration using all cores ("local[*]"); adapt the memory
# values below to your machine.
# NOTE(review): getOrCreate() returns an already-running session if there is
# one, in which case these memory settings presumably do not take effect —
# confirm, or stop the previous session first.
conf = pyspark.SparkConf().setMaster("local[*]").setAll([
                                   ('spark.executor.memory', '12g'),  # find
                                   ('spark.driver.memory','4g'), # your
                                   ('spark.driver.maxResultSize', '2G') # setup
                                  ])
# create the session
spark = SparkSession.builder.config(conf=conf).getOrCreate()

# create the context
sc = spark.sparkContext

In [None]:
query = """
SELECT AirCraftType, count(*) MissionsCount
FROM Bombing_Operations bo
JOIN Aircraft_Glossary ag
ON bo.AirCraft = ag.AirCraft
GROUP BY AirCraftType
ORDER BY MissionsCount DESC
"""

In [None]:
# --- Transformations (lazy: nothing runs until an action is called) ---

# Mapping function
liste.map(lambda x : x*2)

# Filter function
liste.filter(lambda x : x < 2)

# Flatmap (the RDD method is spelled flatMap)
liste.flatMap(lambda x : [x, x*10])

# Sample
liste.sample(withReplacementBoolean, sampledFraction, seed)

# Union
liste.union(dataset2)

# Intersection
liste.intersection(dataset2)

# Distinct
liste.distinct()

# Groupby
dic.groupByKey()

# Reduce - reduceByKey needs a function of two values; the builtin `sum`
# takes an iterable (and is shadowed by `from pyspark.sql.functions import *`)
dic.reduceByKey(lambda a, b: a + b)

# Sort by key
dic.sortByKey()

# Join {(1,a), (2,b)}.join({(1,A), (1,X)}) → {(1, (a,A)), (1, (a,X))}
dic.join(dic2)

# --- Actions ---

# collect - Return all elements of the dataset as an array
data.collect()

# Count elements in the dataset
data.count()

# Take the first n elements
data.take(n)

# Saving
data.saveAsTextFile(path)

# Applied ML

## Preprocessing

In [None]:
# handle missing data
df.fillna({'col': 0})
df.dropna(subset=['important_col'])
df.interpolate(method='time')  # if index is datetime

In [None]:
# Transformation from categorical to numerical -> One hot encoding
X =  pd.get_dummies(pokemon_features[pokemon_features.columns])

In [None]:
# Train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [None]:
# Standard scaler - Scaling
scaler = StandardScaler()
scaler.fit(data)
scaled_data = scaler.transform(data)

## Linear regression

In [None]:
# Linear Regression
lin_reg = LinearRegression()  # create the model
lin_reg.fit(X, y)  # train it
lin_reg.coef_ # Coefficient values for each attribute

lr = LinearRegression()
# Function for cross validation
predicted = cross_val_predict(lr, X, y, cv=5)

## Logistic Regression

In [None]:
logistic = LogisticRegression(solver='lbfgs')
precision = cross_val_score(logistic, X, y, cv=10, scoring="precision")
recall = cross_val_score(logistic, X, y, cv=10, scoring="recall")



logistic = LogisticRegression(solver='lbfgs')
logistic.fit(X, y)
logistic.predict([[25, 100, 0, 1]])
logistic.predict_proba([[25, 100, 0, 1]])

In [None]:
# Cs is the grid of inverse regularisation strengths to cross-validate over:
# either an int (number of values on a log scale between 1e-4 and 1e4) or an
# explicit list of floats. An empty list is invalid; 10 is the library default.
clf = LogisticRegressionCV(Cs=10, cv=10, random_state=0).fit(X, y)
clf.predict(X[:2, :])
clf.predict_proba(X[:2, :]).shape

## K Nearest Neighbours - Classification
Distance measures available:
* euclidean
* manhattan
* chebyshev
* minkowski (default)
* wminkowski
* seuclidean
* mahalanobis
* jaccard
* hamming

In [None]:
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X, y)
knn.predict([[test_instance]])
knn.predict_proba([[0.9]])

## K Nearest Neighbours - Regression

In [None]:
knn = KNeighborsRegressor(n_neighbors=2)
knn.fit(X, y)
knn.predict([[1.5]])

## Random Forest

In [None]:
# RandomForestClassifier is not part of the import cell at the top
from sklearn.ensemble import RandomForestClassifier

n_estimators = 100
max_depth = None

clf = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth, random_state=0)
clf.fit(X_train, y_train)

y_predict = clf.predict(X_test)

# Feature importance
importances = clf.feature_importances_
# spread of each feature's importance across the individual trees
std = np.std([tree.feature_importances_ for tree in clf.estimators_], axis=0)
indices = np.argsort(importances)[::-1]  # most important first

# Plot at most the 10 most important features (fewer if the data has fewer columns)
top_k = min(10, len(indices))
top_indices = indices[:top_k]

plt.figure()
plt.title("Feature importances")
plt.bar(range(top_k), importances[top_indices], yerr=std[top_indices], align="center")
plt.xticks(range(top_k), top_indices)
plt.xlim([-1, top_k])
plt.show()

## Decision Tree

In [None]:
dt = DecisionTreeClassifier(random_state=0)
cross_val_score(dt, iris.data, iris.target, cv=10)

## Feature Selection

In [None]:
# Entropy
def H(p, n):
    """Binary entropy, in bits, of a set with ``p`` positives and ``n`` negatives.

    Uses the convention 0 * log2(0) = 0, so a pure set (p == 0 or n == 0) and
    an empty set both have entropy 0 instead of NaN / ZeroDivisionError.

    Args:
        p: Number of positive examples (scalar count).
        n: Number of negative examples (scalar count).

    Returns:
        Entropy in [0, 1].
    """
    total = p + n
    if total == 0:
        return 0.0
    entropy_bits = 0.0
    for count in (p, n):
        # skip empty classes: log2(0) is -inf and 0 * -inf evaluates to NaN
        if count > 0:
            fraction = count / total
            entropy_bits -= fraction * np.log2(fraction)
    return entropy_bits

def entropy(dataframe, ps, ns):
    '''
    Weighted entropy of the labels after splitting on the attribute.

    Dataframe should have two columns :
    1: "attribute" - the attribute the split is made on
    2: "label" - label of the rows, assumed to be 0 (negative) or 1 (positive)

    ps / ns are the numbers of positive / negative rows in the whole
    dataframe; each attribute value contributes H(pos, neg) of its rows,
    weighted by the share of rows it covers.
    '''
    total = ps + ns
    weighted_terms = []
    for attribute_value in dataframe['attribute'].unique():
        subset_labels = dataframe.loc[dataframe['attribute'] == attribute_value, 'label']
        n_pos = int((subset_labels == 1).sum())
        n_neg = int((subset_labels == 0).sum())
        weighted_terms.append(((n_pos + n_neg) / total) * H(n_pos, n_neg))
    return sum(weighted_terms)

def gain(dataframe):
    '''
    Information gain of splitting on the "attribute" column: entropy of the
    labels before the split minus the weighted entropy after it.
    Expects the same two-column dataframe ("attribute", "label") as entropy().
    '''
    labels = dataframe['label']
    n_positive = int((labels == 1).sum())
    n_negative = int((labels == 0).sum())
    prior_entropy = H(n_positive, n_negative)
    return prior_entropy - entropy(dataframe, n_positive, n_negative)

In [None]:
# Continuous Features
coefficient = pearsonr(x, y)

In [None]:
# Categorical features and label
# Estimated mutual information between each feature and the target
mutual_info_classif(X, y, discrete_features='auto', n_neighbors=3,
                    copy=True, random_state=None)

In [None]:
# Independent two-sample t-test: returns the t statistic and the two-sided p-value
stat, p = ttest_ind(a, b, axis=0, equal_var=True, nan_policy='propagate')

In [None]:
stat, p = wilcoxon(x, y=None, zero_method='wilcox', correction=False, alternative='two-sided')

In [None]:
d, p = kstest(rvs, cdf, args=(), N=20, alternative='two-sided', mode='approx')

## Evaluation

In [None]:
# Compute confusion matrix
from sklearn.metrics import confusion_matrix

y_pred = classifier.predict(X_test)
# compare the predictions with the true labels of the same split (y_test)
cm = confusion_matrix(y_test, y_pred)

# Extract TP, TN, FP, FN
# (unpacking into four values only works for a binary, 2x2 confusion matrix)
tn, fp, fn, tp = cm.ravel()

In [None]:
# Confidence Interval
np.quantile( np.array(BF), q=[0.025, 0.975])

In [None]:
# MSE: Mean squared error
mean_squared_error(y, predicted)

In [None]:
# Precision via cross-validation
precision = cross_val_score(logistic, X, y, cv=10, scoring="precision") # scoring="recall"

In [None]:
# R2 scores - R² Scores : 0-> Bad model, 1-> Good model
r2_score(y_true, y_pred, sample_weight=None, multioutput='uniform_average')

In [None]:
# Accuracy
def get_accuracy(tp, tn, fp, fn):
    """Share of all predictions that are correct: (tp + tn) / all cases."""
    correct = tp + tn
    total = correct + fp + fn
    return correct / total

# Precision
def get_precision(tp, tn, fp, fn):
    """Precision: share of predicted positives that are truly positive.

    ``tn`` and ``fn`` are unused; they are accepted so all metric helpers share
    one signature. Returns 0.0 when nothing was predicted positive
    (tp + fp == 0) instead of raising ZeroDivisionError.
    """
    predicted_positive = tp + fp
    if predicted_positive == 0:
        return 0.0
    return tp / predicted_positive

# Recall
def get_recall(tp, tn, fp, fn):
    """Recall: share of actual positives that were predicted positive.

    ``tn`` and ``fp`` are unused; they are accepted so all metric helpers share
    one signature. Returns 0.0 when there are no actual positives
    (tp + fn == 0) instead of raising ZeroDivisionError.
    """
    actual_positive = tp + fn
    if actual_positive == 0:
        return 0.0
    return tp / actual_positive

# F1 score - F score
def get_f1score(tp, tn, fp, fn):
    """F1 score: harmonic mean of precision and recall.

    Computed with the equivalent closed form 2*tp / (2*tp + fp + fn), which
    stays defined when there are no true positives (precision and recall are
    both 0, so the harmonic-mean formula would divide by zero). ``tn`` is
    unused and only kept for a uniform signature.

    Returns:
        F1 in [0, 1]; 0.0 when tp, fp and fn are all zero.
    """
    denominator = 2 * tp + fp + fn
    if denominator == 0:
        return 0.0
    return 2 * tp / denominator

In [None]:
from sklearn import metrics

# Ground truth as 0/1: the "VOT" column stores the classes as -1 / 1
y_true = df["VOT"].map({-1: 0, 1: 1})
a_y_preds = a_classifier(df)
b_y_preds = b_classifier(df)

# ROC points and area under the curve for each classifier
a_fpr, a_tpr, threshold = metrics.roc_curve(y_true, a_y_preds)
a_roc_auc = metrics.auc(a_fpr, a_tpr)

b_fpr, b_tpr, threshold = metrics.roc_curve(y_true, b_y_preds)
b_roc_auc = metrics.auc(b_fpr, b_tpr)

fig, ax = plt.subplots(1, 2, figsize=(16, 10))

# One panel per classifier, drawn with identical styling
roc_panels = [
    (ax[0], 'A Classifier ROC AUC Curve', a_fpr, a_tpr, a_roc_auc),
    (ax[1], 'B Classifier ROC AUC Curve', b_fpr, b_tpr, b_roc_auc),
]
for panel, panel_title, fpr, tpr, roc_auc in roc_panels:
    panel.set_title(panel_title)
    panel.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % roc_auc)
    panel.legend(loc = 'lower right')
    # diagonal = performance of a random classifier
    panel.plot([0, 1], [0, 1],'r--')
    panel.set_xlim([0, 1])
    panel.set_ylim([0, 1])
    panel.set_ylabel('True Positive Rate')
    panel.set_xlabel('False Positive Rate')

plt.show()

# Unsupervised Learning 

In [None]:
# Euclidean distance
euc_dist = euclidean(x, y, w=None)

# Cosine distance
cos_dist = cosine(x, y, w=None)

# Jaccard distance
jac_dist = jaccard(x, y, w=None)

In [None]:
# K means algorithm
kmeans = KMeans(n_clusters=2, random_state=0).fit(X)
kmeans.labels_ # Cluster label assigned to each sample of X
kmeans.predict([[0, 0], [12, 3]])
kmeans.cluster_centers_ # Centers of the fitted clusters

In [None]:
# DBSCAN
dbscan = DBSCAN(eps=3, min_samples=2).fit(X)

# Natural Language Processing

<span style="color:royalblue">    
    All functions here are related to language processing on a series of books. <br>
    The variable "books" is considered to be a list of several books
</span>

In [None]:
# Delete new lines
books = [" ".join(b.split()) for b in books]

In [None]:
# Convert into raw text / spacy object
# NOTE(review): `nlp` is not defined anywhere in this notebook; presumably a
# loaded spaCy pipeline, e.g. nlp = spacy.load('en_core_web_sm') — confirm
# which model is installed.
doc = nlp(book)

In [None]:
# Get entities
for ent in doc.ents:
    print(ent.text, ent.label_)

In [None]:
# Get tokens
tokens = [token.text for token in doc]

In [None]:
# Get Lemmas
for token in doc:
    print(token.text,'--->',token.lemma_)

In [None]:
# Part of Speech Tagging - POS
# Tag is more specific
pos_tagged = [(token.text, token.pos_, token.tag_) for token in doc]

print(example,'\n')
print(pos_tagged)

print(spacy.explain('CCONJ')) # Explanation of a grammatical class

In [None]:
# Frequent words
word_freq = Counter(words)
common_words = word_freq.most_common()

In [None]:
# Bigrams
bigram = Phrases(docs, min_count=15)

In [None]:
# TF-IDF ~ tf idf
tfids_vec = TfidfVectorizer()
X = tfids_vec.fit_transform(corpus)# Corpus : list of strings
# get_feature_names() was removed in scikit-learn 1.2;
# get_feature_names_out() is its replacement (available since 1.0)
print(tfids_vec.get_feature_names_out())

# Bag of Words - Count vectoriser
bow_vec = CountVectorizer(ngram_range=(2, 2))  # (2, 2) -> bigrams only
X = bow_vec.fit_transform(corpus)
print(bow_vec.get_feature_names_out())
print(X.toarray())

In [None]:
# Latent Semantic Analysis - LSA
lsa = TruncatedSVD(n_components=5, n_iter=7, random_state=42)
lsa.fit(X)
print(lsa.explained_variance_ratio_)
print(lsa.explained_variance_ratio_.sum())
print(lsa.singular_values_)

In [None]:
# Linear Discriminant Analysis - LDA (supervised: needs the labels y)
# NOTE: this is NOT Latent Dirichlet Allocation (unsupervised topic modelling);
# for that, see sklearn.decomposition.LatentDirichletAllocation or gensim.
lda = LinearDiscriminantAnalysis()
lda.fit(X, y)
print(lda.predict([[-0.8, -1]]))

### Entity Types
* PERSON People, including fictional.
* NORP Nationalities or religious or political groups.
* FAC Buildings, airports, highways, bridges, etc.
* ORG Companies, agencies, institutions, etc.
* GPE Countries, cities, states.
* LOC Non-GPE locations, mountain ranges, bodies of water.
* PRODUCT Objects, vehicles, foods, etc. (Not services.)
* EVENT Named hurricanes, battles, wars, sports events, etc.
* WORK_OF_ART Titles of books, songs, etc.
* LAW Named documents made into laws.
* LANGUAGE Any named language.
* DATE Absolute or relative dates or periods.
* TIME Times smaller than a day.
* PERCENT Percentage, including "%".
* MONEY Monetary values, including unit.
* QUANTITY Measurements, as of weight or distance.
* ORDINAL "first", "second", etc.
* CARDINAL Numerals that do not fall under another type. 

In [None]:
# Stopwords
# NOTE(review): this attribute path may need an explicit `import spacy.lang.en`
# before it resolves — confirm.
spacy_stopwords = spacy.lang.en.stop_words.STOP_WORDS
# stop-word tokens that actually occur in the parsed document `doc`
stop_words = [token.text for token in doc if token.is_stop]

# Statistics

In [None]:
# variance
variance = np.var(x)

# Standard Deviation
std = np.std(x)

In [None]:
# Generate random number
seed = randint(low, high)

In [None]:
# randomly sample dataset
df.sample(frac=1, replace=True)

In [None]:
# Bootstraps - Bootstrapping - list
# For each value of 'attribute': bootstrap the mean of 'nr_words' and plot it
# with a 95% percentile confidence interval.
n = 1000  # number of bootstrap resamples per group
mean_bootstraps = []
yerrs_list1 = []  # distance from the mean down to the 2.5th percentile
yerrs_list2 = []  # distance from the mean up to the 97.5th percentile
bootstrap_datasets = []
for bootstrap_dataset in df['attribute'].unique():
    bootstrap_datasets.append(bootstrap_dataset)
    bootstrap_scores = []
    # Keep `df` intact: overwriting it here would leave no rows for the
    # remaining groups.
    df_group = df[df['attribute'] == bootstrap_dataset]
    for i in range(n):
        # Extraction of the sample (same size as the group, with replacement)
        indices_sample = np.random.choice(len(df_group), len(df_group), replace=True)
        df_sample = df_group.iloc[indices_sample]

        # Bootstrap list
        bootstrap_scores.append(np.mean(df_sample['nr_words']))

    # Computation on the whole dataset
    bootstrap_scores = np.sort(bootstrap_scores)
    mean_bootstrap = np.mean(bootstrap_scores)
    mean_bootstraps.append(mean_bootstrap)
    # matplotlib expects non-negative error lengths measured from the bar height
    lower_bootstrap = max(mean_bootstrap - bootstrap_scores[int(n * 0.025)], 0.0)
    higher_bootstrap = max(bootstrap_scores[int(n * 0.975)] - mean_bootstrap, 0.0)

    yerrs_list1.append(lower_bootstrap)
    yerrs_list2.append(higher_bootstrap)


yerrs_list = [yerrs_list1, yerrs_list2]  # [lower errors, upper errors]
positions = list(range(len(bootstrap_datasets)))  # one bar per group
plt.figure(figsize=(25, 8))
# Can change mean and yerrs to array for several barplots
plt.bar(x = positions, height = mean_bootstraps, yerr=yerrs_list, color='paleturquoise')
plt.xticks(positions, bootstrap_datasets)
plt.xlim(-0.5, len(positions) - 0.5)
plt.title('')
plt.xlabel('Mean (*) and 95% confidence interval (-)')
plt.ylabel("Mean square error from the bootstrap sampling")
plt.show()

In [None]:
# Bootstraps - Bootstrapping - One value
# Bootstrap the p-value of a t-test between the attack of grass and rock pokemons
bootstrap_scores = []
n = 10000  # number of bootstrap resamples
n_samples = 56  # rows drawn from each group per resample
for i in range(n):
    # Extraction of the sample
    indices_sample_grass = np.random.choice(list(range(len(grasses))),
                                      n_samples, replace=True)
    indices_sample_rock = np.random.choice(list(range(len(rocks))),
                                      n_samples, replace=True)
    grass_sample = grasses.iloc[indices_sample_grass]
    rock_sample = rocks.iloc[indices_sample_rock]

    stat, p = ttest_ind(grass_sample["attack"], rock_sample["attack"], axis=0,
                        equal_var=True, nan_policy='propagate')
    # Bootstrap list
    bootstrap_scores.append(p)


# Computation on the whole dataset
bootstrap_scores = np.sort(bootstrap_scores)
mean_bootstrap = np.mean(bootstrap_scores)
# 95% percentile interval, expressed as non-negative distances from the mean
lower_bootstrap = max(mean_bootstrap - bootstrap_scores[int(n * 0.025)], 0.0)
higher_bootstrap = max(bootstrap_scores[int(n * 0.975)] - mean_bootstrap, 0.0)

std = np.std(bootstrap_scores)

# yerr rows are [lower errors, upper errors], one column per bar.
# Bar 0 shows the 95% interval, bar 1 shows +/- one standard deviation.
yerrs = [[lower_bootstrap, std], [higher_bootstrap, std]]

plt.figure(figsize=(10, 8))
# Can change mean and yerrs to array for several barplots
plt.bar(x = [0, 1], height = mean_bootstrap, yerr=yerrs, color='paleturquoise')
plt.xticks([0, 1], ['95% confidence interval', 'STD'])
plt.xlabel('p-value of the t test between the attack value of grass and rock pokemons')
plt.xlim(-0.5, 1.5)
plt.title('Bootstrapped p-Value')
plt.xlabel('Mean (*) and 95% confidence interval (-)')
plt.ylabel("Mean square error from the bootstrap sampling")
plt.show()

# Graph Networks

### Creation of the graph

In [None]:
# Create graph
G = nx.Graph() # undirected
di_G = nx.DiGraph() # directed

# Add Nodes
G.add_node(1)
G.add_nodes_from(range(2,9))

# Add Edges
G.add_edge(1,2)
edges = [(2,3), (1,3), (4,1), (4,5), (5,6), (5,7), (6,7), (7,8), (6,8)]
G.add_edges_from(edges)

In [None]:
# Add edges from pandas
G = nx.from_pandas_edgelist(edges, 'Source', 'Target',
                                  edge_attr=None, create_using = nx.Graph())

# Add nodes from pandas
# add node attributes by passing dictionary of type name -> attribute
nx.set_node_attributes(G, nodes['attributes'].to_dict(), 'attributes' )

In [None]:
# Random Graph
erG = nx.gnm_random_graph(n, m) # n:nodes - m:edges

### Information about the graph

In [None]:
# Check Nodes
G.nodes()

# Get Information
# nx.info() was removed in networkx 3.0; there, printing the graph itself
# gives the type and the number of nodes and edges.
print(nx.info(G) if hasattr(nx, 'info') else G)

In [None]:
# Draw Graph
nx.draw_spring(G, with_labels=True,  alpha = 0.6)

In [None]:
# Degree distribution
def plot_degree_distribution(G):
    """Draw a bar chart of how many nodes of G have each degree."""
    # degree -> number of nodes with that degree
    degree_counts = {}
    for vertex in G.nodes():
        vertex_degree = G.degree(vertex)
        degree_counts[vertex_degree] = degree_counts.get(vertex_degree, 0) + 1

    by_degree = sorted(degree_counts.items())
    deg = [degree for degree, _ in by_degree]
    cnt = [count for _, count in by_degree]

    fig, ax = plt.subplots()
    plt.bar(deg, cnt, width=0.80, color='plum')
    plt.title("Degree Distribution")
    plt.ylabel("Frequency")
    plt.xlabel("Degree")
    # one tick per observed degree, labelled with the degree itself
    ax.set_xticks([d+0.05 for d in deg])
    ax.set_xticklabels(deg)

# Graph properties
def describe_graph(G):
    """Print summary statistics of the graph G.

    Prints the basic graph info, then (only if G is connected) the average
    shortest path length and the diameter, followed by the density and the
    transitivity.
    """
    # nx.info() was removed in networkx 3.0; there, printing the graph itself
    # gives the type and the number of nodes and edges.
    print(nx.info(G) if hasattr(nx, 'info') else G)
    if nx.is_connected(G):
        print("Avg. Shortest Path Length: %.4f" %nx.average_shortest_path_length(G))
        print("Diameter: %.4f" %nx.diameter(G)) # Longest shortest path
    else:
        # both quantities need a path between every pair of nodes
        print("Graph is not connected")
        print("Diameter and Avg shortest path length are not defined!")
    print("Sparsity: %.4f" %nx.density(G))  # #edges/#edges-complete-graph
    # #closed-triplets(3*#triangles)/#all-triplets
    print("Global clustering coefficient aka Transitivity: %.4f" %nx.transitivity(G))

# Helper function for visualizing the graph
def visualize_graph(G, with_labels=True, k=None, alpha=0.6, node_shape='o'):
    """Draw G with a spring layout on a new 20x8 figure, axes hidden.

    with_labels: write each node's name next to it.
    k: passed to nx.spring_layout.
    alpha: transparency of the edges.
    node_shape: matplotlib marker used for the nodes.
    """
    plt.figure(figsize=(20, 8))
    layout = nx.spring_layout(G, k=k)
    if with_labels:
        node_labels = {node: node for node in G.nodes()}
        nx.draw_networkx_labels(G, layout, labels=node_labels)
    nx.draw_networkx_edges(G, layout, alpha=alpha)
    nx.draw_networkx_nodes(G, layout, nodelist=G.nodes(), node_color='powderblue', node_shape=node_shape)
    plt.axis('off')

In [None]:
# Circular graph
nx.draw_circular(G, with_labels=True,  node_color='powderblue', alpha = 0.8)

In [None]:
# Sparsity
print("Network sparsity: %.4f" %nx.density(G))

In [None]:
# Check if connected graph
print(nx.is_connected(G))

In [None]:
# Number of components
comp = list(nx.connected_components(G))
print('The graph contains', len(comp), 'connected components')

In [None]:
# Largest component
largest_comp = max(comp, key=len)
percentage_lcc = len(largest_comp)/G.number_of_nodes() * 100
print('The largest component has', len(largest_comp), 'nodes',
      'accounting for %.2f'% percentage_lcc, '% of the nodes')

In [None]:
# shortest path
path = nx.shortest_path(G, source="source node", target="target node")

In [None]:
# Longest short path - diameter of the graph
# Diameter and average shortest path length are only defined on a connected
# graph, so compute them on the largest connected component, not on G.
temp_graph = G.subgraph(largest_comp)
print("The diameter of the largest connected component is",
      nx.diameter(temp_graph))
print("The avg shortest path length of the largest connected component is",
      nx.average_shortest_path_length(temp_graph))

In [None]:
# Ratio of all possible triangles - Transitivity - Triadic closure
print('%.4f' %nx.transitivity(G))

In [None]:
# Clustering coefficient
print(nx.clustering(G, ['node1', 'node2']))

In [None]:
# Subgraph
# Ego network of 'Node 1': the node itself plus its direct neighbours
subgraph_node1 = G.subgraph(['Node 1'] + list(G.neighbors('Node 1')))
# draw the subgraph that was just built (two alternative layouts)
nx.draw_spring(subgraph_node1, with_labels=True)
nx.draw_circular(subgraph_node1, with_labels=True)

In [None]:
# Compute degrees and see importance
degrees = dict(G.degree(G.nodes()))
sorted_degree = sorted(degrees.items(), key=itemgetter(1), reverse=True)

# And the top 5 most popular quakers are..
for nodeName, degree in sorted_degree[:5]:
    # G.node was removed in networkx 2.4; G.nodes is the supported accessor
    print(nodeName, 'who is', G.nodes[nodeName]['Role'], 'knows', degree, 'people')

In [None]:
# Degree Distribution Scatter plot
degree_seq = [d[1] for d in sorted_degree]
degreeCount = collections.Counter(degree_seq)
degreeCount = pd.DataFrame.from_dict( degreeCount, orient='index').reset_index()
fig = plt.figure()
ax = plt.gca()
ax.plot(degreeCount['index'], degreeCount[0], 'o', c='blue', markersize= 4)
plt.ylabel('Frequency')
plt.xlabel('Degree')
plt.title('Degree distribution for the Quaker network')

In [None]:
# Katz centrality
degrees = dict(G.degree(G.nodes()))

katz = nx.katz_centrality(G)
# store the centrality on the nodes so it can be reused (e.g. for colouring)
nx.set_node_attributes(G, katz, 'katz')
sorted_katz = sorted(katz.items(), key=itemgetter(1), reverse=True)

# And the top 5 most popular quakers are..
for nodeName, katzc in sorted_katz[:5]:
    # G.node was removed in networkx 2.4; G.nodes is the supported accessor
    print(nodeName, 'who is', G.nodes[nodeName]['Role'], 'has katz-centrality: %.3f' %katzc)

In [None]:
# Compute betweenness centrality
betweenness = nx.betweenness_centrality(G)
# Assign the computed centrality values as a node-attribute in your network
nx.set_node_attributes(G, betweenness, 'betweenness')
sorted_betweenness = sorted(betweenness.items(), key=itemgetter(1), reverse=True)

for nodeName, bw in sorted_betweenness[:5]:
    # G.node was removed in networkx 2.4; G.nodes is the supported accessor
    print(nodeName, 'who is', G.nodes[nodeName]['Role'], 'has betweeness: %.3f' %bw)

In [None]:
# Betweenness centrality heatmap
# (requires the 'betweenness' node attribute to have been set beforehand)
list_nodes =list(G.nodes())
list_nodes.reverse()   # for showing the nodes with high betweeness centrality
pos = nx.spring_layout(G)
ec = nx.draw_networkx_edges(G, pos, alpha=0.1)
# draw_networkx_nodes has no `with_labels` argument (it never draws labels)
nc = nx.draw_networkx_nodes(G, pos, nodelist=list_nodes,
                            node_color=[G.nodes[n]["betweenness"] for n in list_nodes],
                            alpha=0.8, node_shape = '.')
plt.colorbar(nc)
plt.axis('off')
plt.show()

### Communities

In [None]:
# Girvan Newman
comp = girvan_newman(G)
it = 0
for communities in itertools.islice(comp, 4):
    it +=1
    print('Iteration', it)
    print(tuple(sorted(c) for c in communities))

In [None]:
# Louvain
partition = community_louvain.best_partition(G)
# add it as an attribute to the nodes
for n in G.nodes:
    G.nodes[n]["louvain"] = partition[n]

In [None]:
# Draw the graph with nodes coloured by their Louvain community
# (requires the 'louvain' node attribute to have been set beforehand)
pos = nx.spring_layout(G, k=0.2)
ec = nx.draw_networkx_edges(G, pos, alpha=0.2)
# draw_networkx_nodes has no `with_labels` argument (it never draws labels)
nc = nx.draw_networkx_nodes(G, pos, nodelist=G.nodes(),
                            node_color=[G.nodes[n]["louvain"] for n in G.nodes],
                            node_size=100, cmap=plt.cm.jet)
plt.axis('off')
plt.show()

In [None]:
# Community id of the node named 'Node'
clusterNode = partition['Node']
# Take all the nodes that belong to the same cluster
members_c = [q for q in G.nodes if partition[q] == clusterNode]
# get info about these quakers
for nodeName in members_c:
    # G.node was removed in networkx 2.4; G.nodes is the supported accessor
    print(nodeName, 'who is', G.nodes[nodeName]['Role'], 'and died in ',
          G.nodes[nodeName]['Deathdate'])

In [None]:
# Homophily
# for categorical attributes
nx.attribute_assortativity_coefficient(G, 'Gender')

In [None]:
# Check if edge
G.has_edge(a, b)