# Introduction to Large Data Analysis (2)
Data analysis using pandas & Drawing figures

### 1. Loading a data file as a pandas DataFrame<a name="1.1"></a>

In [None]:
"""
*Important*
Download the sample data
"""
!wget -q https://raw.githubusercontent.com/CropEvol/lecture/master/data/mutmap_bulk.txt -O mutmap_bulk.txt
    
#--- Import library ---
import pandas as pd

#--- Loading data from file ---
dataset = 'mutmap_bulk.txt'        # input-file name
df = pd.read_csv(dataset, sep='\t', header=-1, names=['chr', 'pos', 'ref_nucl', 'alt_nucl', 'ref_N', 'alt_N']) 

df  # show

### 2. Accessing arbitrary data<a name="1.2"></a>

In [None]:
###### Show the whole dataset ######
# Uncomment ONE of the example lines below (and comment out the others) to try it;
# a notebook cell only displays the value of its last expression.
#
# .loc  selects by label (row index label / column name); slices INCLUDE the end label.
# .iloc selects by integer position; slices EXCLUDE the end position.
df


###### Extract one column ######
# All three forms select the 'ref_nucl' column (the column at position 2).
#df['ref_nucl']
#df.loc[:, 'ref_nucl']
#df.iloc[:, 2]


###### Extract multiple columns ######
# Positions 2:4 mean positions 2 and 3, i.e. 'ref_nucl' and 'alt_nucl'.
#df.loc[:, ['ref_nucl','alt_nucl']]
#df.iloc[:, 2:4]


###### Extract one row ######
#df.loc[10,:]
#df.iloc[10,:]


###### Extract multiple rows ######
# Note the difference: .loc[10:15] includes the row labelled 15,
# while .iloc[10:15] stops at position 14.
#df.loc[10:15, :]
#df.iloc[10:15, :]


###### Extract one data cell ######
#df.loc[10, 'ref_nucl']
#df.iloc[10, 2]


###### Extract multiple data cells ######
# As above, the .loc row slice is end-inclusive and the .iloc slices are end-exclusive.
#df.loc[10:15, ['ref_nucl', 'alt_nucl']]
#df.iloc[10:15, 2:4]


### 3. Calculating SNP-index<a name="1.3"></a>

In [None]:
###### Calculate the SNP-index ######

# !!! Add code !!!
# For every row: snp_index = alt_N / (ref_N + alt_N).
# The result is stored in a new 'snp_index' column of df.
df['snp_index'] = df['alt_N'].div(df['ref_N'] + df['alt_N'])

#--- Show the table including the new column ---
df

### 4. Selecting data by condition<a name="1.4"></a>

In [None]:
###### Selecting rows by condition ######

#--- Single condition ---
# A comparison on a column yields a boolean Series (one True/False per row):
# df['ref_nucl']=='A'

# Indexing df with that boolean Series keeps only the rows where it is True:
# df[ df['ref_nucl']=='A' ]

#--- Multiple conditions ---
# Wrap each condition in parentheses and combine them with & (AND) or | (OR).
# df[ (df['ref_nucl']=='A' ) & (df['alt_nucl']=='G' ) ]    # AND

# df[ (df['ref_nucl']=='A' ) | (df['alt_nucl']=='G' ) ]    # OR


#--- Keep only the rows with SNP-index >= 0.9 ---
# !!! Add code !!!
df.loc[df['snp_index'] >= 0.9]


### 5. Writing into a file<a name="1.5"></a>

In [None]:
###### write new table into the output-file ######
#outdata = 'mutmap_snpindex.txt'        # output-file name
#df.to_csv(outdata, sep='\t', header=True, index=False)

### 6. Drawing graph<a name="1.6"></a>

In [None]:
"""
The below line is needed to display the graph in Jupyter Notebook.
This is not a python program. This is a "Magic command" of Jupyter Notebook.
"""
%matplotlib inline


"""
Python program is from here
"""

#--- Import library ---
import matplotlib.pyplot as plt

#--- x-values, y-values ---
df['snp_index'] = df['alt_N'] / (df['ref_N'] + df['alt_N'])
x = df['pos']
y = df['snp_index']

#--- Drawing all data ---
fig = plt.figure(figsize=[16,9])    # graph field
plt.scatter(x, y, color='gray')      # scatter plot
plt.title('SNP-index on chromosome 10', fontsize=24)  # title of this graph
plt.xlabel('Position (x 10 Mb)', fontsize=16)  # label of x-axis
plt.ylabel('SNP-index', fontsize=16)                # label of y-axis


# Drawing the data of "SNP-index >= 0.9"
df_ext = df[ df['snp_index'] >= 0.9 ]

x1 = df_ext['pos']       # x-values
y1 = df_ext['snp_index'] # , y-values

plt.scatter(x1, y1, color='red')      # Scatter plot

### 7. Sliding window analysis<a name="1.7"></a>

In [None]:
###### Sliding-window analysis ######
#--- Import library ---
import numpy as np

#--- Chromosome size, window size, step size (all in bp) ---
CHROM_SIZE = 23207287              # length of chromosome 10
WIN_SIZE = 1 * 1000 * 1000         # window size: 1 Mb = 1000 kb = 1,000,000 bp
STEP_SIZE = 0.2 * 1000 * 1000      # step size: 0.2 Mb = 200 kb = 200,000 bp

#--- Result lists, one entry per window ---
win_position = []   # midpoint of each window
win_snpindex = []   # mean SNP-index of the rows inside each window

#--- Walk over all windows ---
# Start and end of the successive windows (in kb):
#     0   ..    0 + 1000
#   200   ..  200 + 1000
#   400   ..  400 + 1000
#     ...
# Expressed with WIN_SIZE and STEP_SIZE, for n = 0, 1, 2, ...:
#   start = STEP_SIZE * n
#   end   = start + WIN_SIZE
# The walk stops once a window with end > CHROM_SIZE has been processed,
# so the last recorded window reaches past the end of the chromosome.

n = 0     # index of the current window
end = 0   # placeholder so the first pass of the loop always runs
while end <= CHROM_SIZE:
    #--- Boundaries of window n: start is inclusive, end is exclusive ---
    start = STEP_SIZE * n
    end = start + WIN_SIZE

    #--- Midpoint of the window ---
    win_position.append((start + end) / 2)

    #--- Mean SNP-index of the rows located in the window ---
    in_window = (df['pos'] >= start) & (df['pos'] < end)
    win_snpindex.append(df.loc[in_window, 'snp_index'].mean())

    #--- Move on to the next window ---
    n += 1

#--- Subset with SNP-index >= 0.9 and its x/y values ---
df_ext = df.loc[df['snp_index'] >= 0.9]
x1, y1 = df_ext['pos'], df_ext['snp_index']

#--- Figure, title and axis labels ---
fig = plt.figure(figsize=(16, 9))
plt.title('SNP-index on chromosome 10', fontsize=24)
plt.xlabel('Position (x 10 Mb)', fontsize=16)
plt.ylabel('SNP-index', fontsize=16)

#--- Scatter plots: all data in gray, then the high SNP-index points in red ---
plt.scatter(x, y, color='gray')
plt.scatter(x1, y1, color='red')

#--- Line plot of the sliding-window averages ---
plt.plot(win_position, win_snpindex, color='blue')

In [None]:
### Extract the windows whose average SNP-index is at least 0.9 ###

# Table of sliding windows: midpoint ('pos') and mean SNP-index of each window
W = pd.DataFrame({'pos': win_position, 'snp_index': win_snpindex})

# Window boundaries: the midpoint minus / plus half the window size
W = W.assign(
    start=W['pos'] - WIN_SIZE / 2,
    end=W['pos'] + WIN_SIZE / 2,
)

# Windows with mean SNP-index >= 0.9
W.loc[W['snp_index'] >= 0.9]