#### Declaring text to analyse

In [619]:
import pandas as pd

# Load the text as one row per line of the file.
# Plain file iteration is used rather than pd.read_fwf: read_fwf is a
# fixed-width *column* parser whose `delimiter` argument names filler
# characters (not a separator), so it is not a dependable way to keep each
# line intact as a single value.
with open('text_data.txt', encoding='utf-8') as text_file:
    raw_lines = [line.rstrip('\r\n') for line in text_file]

# Blank lines carry no text to analyse, so they are left out of the frame.
df = pd.DataFrame({"Data": [line for line in raw_lines if line.strip()]})

df

Unnamed: 0,Data
0,"As a term, data analytics predominantly refers..."
1,"intelligence (BI), reporting and online analyt..."
2,"analytics. In that sense, it's similar in natu..."
3,approaches to analyzing data -- with the diffe...
4,data analytics has a broader focus. The expans...
5,"cases, people use data analytics specifically ..."
6,category. Data analytics initiatives can help ...
7,"efficiency, optimize marketing campaigns and c..."
8,emerging market trends and gain a competitive ...
9,boosting business performance. Depending on th...


#### Probability of the word “data” occurring in each line

In [620]:
# A line "contains the word data" only on a whole-word, case-insensitive match:
# "Data" at the start of a sentence counts, "database"/"metadata" do not.
# na=False keeps the mask strictly boolean, so negating it with ~ is safe even
# if a row holds a missing value.
has_data = df['Data'].str.contains(r'\bdata\b', case=False, regex=True, na=False)

data = df.loc[has_data]
nodata = df.loc[~has_data]

total_lines = len(df)

# Fraction of lines with / without the word; the two values sum to 1.
probability_data = len(data) / total_lines
probability_no_data = len(nodata) / total_lines

print('Probabilty of word','data','occuring in a line:',probability_data)
print('Probabilty of word','data','not occuring in a line:',probability_no_data)


Probabilty of word data occuring in a line: 0.5454545454545454
Probabilty of word data not occuring in a line: 0.45454545454545453


#### Distribution of distinct word counts across all the lines 

In [621]:
from collections import Counter

## Distribution of distinct word across all lines

# Tally the space-separated tokens of each line, then add the per-line tallies
# together column by column. Tokens keep any attached punctuation, so "term,"
# and "term" are counted as different words.
# Series.value_counts() is used because the top-level pd.value_counts()
# helper is deprecated.
df.Data.apply(lambda x: pd.Series(x.split(" ")).value_counts()).sum(axis = 0)

## Distinct words in each line  ⇊⇊⇊ 0:21

# Optional per-line check, left disabled: counts the tokens of the single row
# selected by label (21 here).
# distinct = Counter(" ".join(df.loc[21].values.tolist()).split(" ")).items()
# distinct

an                 1.0
refers             1.0
business           4.0
basic              1.0
a                 10.0
term,              1.0
assortment         1.0
of                10.0
predominantly      1.0
analytics          7.0
applications,      1.0
from               2.0
data              14.0
to                11.0
As                 1.0
(BI),              1.0
(OLAP)             1.0
intelligence       1.0
and                9.0
online             1.0
forms              1.0
various            1.0
advanced           2.0
reporting          1.0
analytical         1.0
processing         1.0
that               4.0
another            1.0
umbrella           1.0
nature             1.0
                  ... 
be                 2.0
qualitative        2.0
separated          1.0
also               1.0
quantitative       1.0
analysis.          1.0
into               1.0
variables          1.0
involves           1.0
quantifiable       1.0
numerical          1.0
former             1.0
measured   

#### Probability of the word “analytics” occurring after the word “data”

In [622]:
import re

# Count every whole-word "analytics" (any letter case). Word boundaries are
# used so that an occurrence at the very end of a line, or followed by any
# punctuation, is counted too.
analytics_occurrences = df.Data.str.count(r'\banalytics\b', flags=re.I).sum()
print("Occurrences of \"analytics\" occur:" ,analytics_occurrences)

# Count occurrences (not lines) of "analytics" directly preceded by "data", so
# the numerator and the denominator below are measured in the same unit.
# Each line is matched on its own, so a "data" / "analytics" pair split across
# two lines is not counted.
data_analytics_occurrences = df.Data.str.count(r'\bdata\s+analytics\b', flags=re.I).sum()
print("Occurrences of \"analytics\" occurs after \"data\":" ,data_analytics_occurrences ,"\n")

# Share of all "analytics" occurrences that come immediately after "data".
prob_data_analytics = data_analytics_occurrences / analytics_occurrences
print("Probability of \"analytics\" occurs after \"data\":",prob_data_analytics)


Occurrences of "analytics" occur: 10
Occurrences of "analytics" occurs after "data": 5 

Probability of "analytics" occurs after "data": 0.5
