# my notes on reading linux kernel 0.97 codes

## net/unix.c

```c
/*
 * buffer size must be power of 2. buffer mgmt inspired by pipe code.
 * note that buffer contents can wraparound, and we can write one byte less
 * than full size to discern full vs empty.
 */
/* Capacity of the ring buffer: exactly one page. */
#define BUF_SIZE PAGE_SIZE
/*
 * Number of bytes currently stored: the distance from bp_tail up to
 * bp_head. Masking with BUF_SIZE-1 reduces the difference modulo BUF_SIZE,
 * so the result is still correct after bp_head has wrapped below bp_tail.
 */
#define UN_BUF_AVAIL(UPD) (((UPD)->bp_head - (UPD)->bp_tail) & (BUF_SIZE-1))
/*
 * Number of bytes that can still be stored. One slot is always left
 * unused, so a completely full buffer never looks like an empty one
 * (bp_head == bp_tail).
 */
#define UN_BUF_SPACE(UPD) ((BUF_SIZE-1) - UN_BUF_AVAIL(UPD))
```

This is a classic ring-buffer (circular buffer) implementation.

* if `bp_head == bp_tail`, the buffer is empty

* `UN_BUF_AVAIL` is the data size for reading in buffer. `UN_BUF_SPACE` is the empty space size for writing in buffer.

* `& (BUF_SIZE-1)` is equivalent to `% BUF_SIZE` because `BUF_SIZE` is a power of 2.

* Writing to the buffer advances `bp_head`; reading from the buffer advances `bp_tail`.

* When `bp_head` crosses the upper boundary (`BUF_SIZE`), it wraps around and starts again from the beginning: `pupd->bp_head = (pupd->bp_head + cando) & (BUF_SIZE-1);`. In that case `bp_head` is smaller than `bp_tail`, and `(bp_head - bp_tail) & (BUF_SIZE-1)` equals `bp_head + BUF_SIZE - bp_tail`, which is exactly the number of bytes available for reading.

NICE IMPLEMENTATION !

------------------------

```c
/*
 * we write to our peer's buf. when we connected we ref'd this peer so we
 * are safe that the buffer remains, even after the peer has disconnected,
 * which we check other ways.
 */
static int
unix_proto_write(struct socket *sock, char *ubuf, int size, int nonblock)
{
	struct unix_proto_data *pupd;
	int todo, space;

	/* xitongsys:
	using assign statement value
	*/
	if ((todo = size) <= 0)
		return 0;
	if (sock->state != SS_CONNECTED) {
		PRINTK("unix_proto_write: socket not connected\n");
		if (sock->state == SS_DISCONNECTING) {
			send_sig(SIGPIPE,current,1);
			return -EINTR;
		}
		return -EINVAL;
	}
	pupd = UN_DATA(sock)->peerupd;	/* safer than sock->conn */

	while (!(space = UN_BUF_SPACE(pupd))) {
		PRINTK("unix_proto_write: no space left...\n");
		if (nonblock)
			return -EAGAIN;
		interruptible_sleep_on(sock->wait);

		*/
		if (current->signal & ~current->blocked) {
			PRINTK("unix_proto_write: interrupted\n");
			return -ERESTARTSYS;
		}
		if (sock->state == SS_DISCONNECTING) {
			PRINTK("unix_proto_write: disconnected (SIGPIPE)\n");
			send_sig(SIGPIPE,current,1);
			return -EINTR;
		}
	}

	/*
	 * copy from the user's buffer to the write buffer, watching for
	 * wraparound. then we wake up the reader
	 */
	do {
		int part, cando;

		if (space <= 0) {
			PRINTK("unix_proto_write: SPACE IS NEGATIVE!!!\n");
			send_sig(SIGKILL,current,1);
			return -EINTR;
		}

		/*
		 * we may become disconnected inside this loop, so watch
		 * for it (peerupd is safe until we close)
		 */
		if (sock->state == SS_DISCONNECTING) {
			send_sig(SIGPIPE,current,1);
			return -EINTR;
		}
		if ((cando = todo) > space)
			cando = space;
		if (cando > (part = BUF_SIZE - pupd->bp_head))
			cando = part;
		PRINTK("unix_proto_write: space=%d, todo=%d, cando=%d\n",
		       space, todo, cando);
		verify_area(ubuf, cando);
		memcpy_fromfs(pupd->buf + pupd->bp_head, ubuf, cando);
		pupd->bp_head = (pupd->bp_head + cando) & (BUF_SIZE-1);
		ubuf += cando;
		todo -= cando;
		if (sock->state == SS_CONNECTED)
			wake_up(sock->conn->wait);
		space = UN_BUF_SPACE(pupd);
	} while (todo && space);
	return size - todo;
}
```

* `if ((todo = size) <= 0)` good style.

* EINTR: Many system calls report the EINTR error code if a signal arrived while the call was in progress. No error actually occurred; it is reported that way because the system could not resume the call automatically. The usual user-space pattern is simply to retry the call whenever it fails with EINTR.

* ERESTARTSYS: -ERESTARTSYS is connected to the concept of a restartable system call. A restartable system call is one that can be transparently re-executed by the kernel when there is some interruption.
For instance the user space process which is sleeping in a system call can get a signal, execute a handler, and then when the handler returns, it appears to go back into the kernel and keeps sleeping on the original system call.
Using the POSIX sigaction API's SA_RESTART flag, processes can arrange the restart behavior associated with signals.
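In the Linux kernel, when a driver or other module blocking in the context of a system call detects that a task has been woken because of a signal, it can return -EINTR. But -EINTR will bubble up to user space and cause the system call to return -1 with errno set to EINTR. If it returns -ERESTARTSYS instead, the call becomes restartable: user space never sees ERESTARTSYS itself, because the kernel either restarts the call transparently or converts the error to EINTR.
[ERESTARTSYS](https://stackoverflow.com/questions/9576604/what-does-erestartsys-used-while-writing-linux-driver)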

* EINVAL: invalid argument

* 

```c
if (sock->state == SS_CONNECTED)
	wake_up(sock->conn->wait);
```

wake up the waiting task to read from the buffer.

-------------


## lib/malloc.c

```c
struct bucket_desc {	/* 16 bytes */
	void			*page;
	struct bucket_desc	*next;
	void			*freeptr;
	unsigned short		refcnt;
	unsigned short		bucket_size;
};

struct _bucket_dir {	/* 8 bytes */
	int			size;
	struct bucket_desc	*chain;
};

/*
 * The following is the where we store a pointer to the first bucket
 * descriptor for a given size.  
 *
 * If it turns out that the Linux kernel allocates a lot of objects of a
 * specific size, then we may want to add that specific size to this list,
 * since that will allow the memory to be allocated more efficiently.
 * However, since an entire page must be dedicated to each specific size
 * on this list, some amount of temperance must be exercised here.
 *
 * Note that this list *must* be kept in order.
 */
struct _bucket_dir bucket_dir[] = {
	{ 16,	(struct bucket_desc *) 0},
	{ 32,	(struct bucket_desc *) 0},
	{ 64,	(struct bucket_desc *) 0},
	{ 128,	(struct bucket_desc *) 0},
	{ 256,	(struct bucket_desc *) 0},
	{ 512,	(struct bucket_desc *) 0},
	{ 1024,	(struct bucket_desc *) 0},
	{ 2048, (struct bucket_desc *) 0},
	{ 4096, (struct bucket_desc *) 0},
	{ 0,    (struct bucket_desc *) 0}};   /* End of list marker */
```

* `{ 0,    (struct bucket_desc *) 0}};   /* End of list marker */` nice trick

* every `bucket_desc` points to a page and the `refcnt` is the reference count of the page. `freeptr` point to one free object on the page.

* `_bucket_dir.chain` is a list of pages

----------------


![malloc01.png](imgs/malloc01.png)
```c
/*
 * Kernel allocator entry point: returns an object taken from the smallest
 * bucket whose size is >= len.
 *
 * Panics (never returns NULL) when len is larger than every bucket size or
 * when no page can be obtained for a new bucket. The free-list work runs
 * between cli() and sti().
 */
void *malloc(unsigned int len)
{
	struct _bucket_dir	*bdir;
	struct bucket_desc	*bdesc;
	void			*retval;

	/*
	 * First we search the bucket_dir to find the right bucket change
	 * for this request.
	 */
	/* bucket_dir is sorted ascending and ends with a size-0 sentinel */
	for (bdir = bucket_dir; bdir->size; bdir++)
		if (bdir->size >= len)
			break;
	if (!bdir->size) {
		printk("malloc called with impossibly large argument (%d)\n",
			len);
		panic("malloc: bad arg");
	}
	/*
	 * Now we search for a bucket descriptor which has free space
	 */
	cli();	/* Avoid race conditions */
	for (bdesc = bdir->chain; bdesc; bdesc = bdesc->next) 
		if (bdesc->freeptr)
			break;
	/*
	 * If we didn't find a bucket with free space, then we'll 
	 * allocate a new one.
	 */
	if (!bdesc) {
		char		*cp;
		int		i;

		/* take a descriptor off the free-descriptor list, refilling it if empty */
		if (!free_bucket_desc)	
			init_bucket_desc();
		bdesc = free_bucket_desc;
		free_bucket_desc = bdesc->next;
		bdesc->refcnt = 0;
		bdesc->bucket_size = bdir->size;
		/* one fresh page backs this bucket; cp walks over it below */
		bdesc->page = bdesc->freeptr = (void *) cp = get_free_page(GFP_KERNEL);
		if (!cp)
			panic("Out of memory in kernel malloc()");
		/* Set up the chain of free objects */
		/* the first word of each free object stores the address of the next one */
		for (i=PAGE_SIZE/bdir->size; i > 1; i--) {
			*((char **) cp) = cp + bdir->size;
			cp += bdir->size;
		}
		/* last object on the page terminates the free list */
		*((char **) cp) = 0;
		bdesc->next = bdir->chain; /* OK, link it in! */
		bdir->chain = bdesc;
	}
	/* pop the head of the free list: its first word is the next free object */
	retval = (void *) bdesc->freeptr;
	bdesc->freeptr = *((void **) retval);
	bdesc->refcnt++;
	sti();	/* OK, we're safe again */
	return(retval);
}
```
* This code is so beautiful and has some tricks !
```c
bdesc->page = bdesc->freeptr = (void *) cp = get_free_page(GFP_KERNEL);
if (!cp)
	panic("Out of memory in kernel malloc()");
/* Set up the chain of free objects */
for (i=PAGE_SIZE/bdir->size; i > 1; i--) {
	*((char **) cp) = cp + bdir->size;
	cp += bdir->size;
}
*((char **) cp) = 0;
```

1. If there is no free object, call `get_free_page` to get a page and lay out `PAGE_SIZE/bdir->size` objects on it.

2. For every object in the page, the first 4 bytes is a pointer which points to the next free object. This is done by `*((char **) cp) = cp + bdir->size;` NICE CODE!!!

3. `*((char **) cp) = 0;` the last item of the list points to NULL. Tricky code !!!

4. Although the first 4 bytes of every free object are used as a linked-list pointer, that pointer is no longer needed once the object has been handed out, so no space is wasted. NICE!

*

```c
retval = (void *) bdesc->freeptr;
bdesc->freeptr = *((void **) retval);
bdesc->refcnt++;
```

1. retval is the got object address

2. `bdesc->freeptr = *((void **) retval);` set the freeptr to point to next object.

-----------

## mm/swap.c

```c
#define SWAP_BITS (4096<<3)
```

* The first page is used as bitmap. totally has 4K * 8 bits.

```c
/*
 * We never page the pages in task[0] - kernel memory.
 * We page all other pages.
 */
#define FIRST_VM_PAGE (TASK_SIZE>>12)
#define LAST_VM_PAGE (1024*1024)
#define VM_PAGES (LAST_VM_PAGE - FIRST_VM_PAGE)

```
* In this kernel version the linear address ranges of different tasks do not overlap: every task gets its own 64MB of linear address space.

* `TASK_SIZE = 64MB`, so `FIRST_VM_PAGE = 64MB/4KB` is the page number of the first page above task[0]'s address range, i.e. where the second task's range begins.

* `LAST_VM_PAGE = 4GB/4KB`

-----------

```c
/*
 * Allocate one slot in the swap bitmap.
 *
 * Scans the bits from lowest_bit to highest_bit and returns the number of
 * the first one for which clrbit() yields nonzero. Returns 0 when there is
 * no bitmap or no such bit, so 0 doubles as the failure value.
 */
static unsigned int get_swap_page(void)
{
	unsigned int nr;

	if (!swap_bitmap)
		return 0;
	for (nr = lowest_bit; nr <= highest_bit ; nr++)
		/* presumably clrbit() clears the bit and reports whether it was set (free) - defined elsewhere, confirm */
		if (clrbit(swap_bitmap,nr)) {
			/* the top of the scan window was just used up: shrink it */
			if (nr == highest_bit)
				highest_bit--;
			/* the next scan starts here instead of at bit 0 */
			return lowest_bit = nr;
		}
	return 0;
}

/*
 * Release swap slot swap_nr. A swap_nr of 0 is silently ignored.
 *
 * Widens the [lowest_bit, highest_bit] scan window to include the slot and
 * sets its bit. A diagnostic is printed when there is no bitmap, when
 * swap_nr is out of range, or when setbit() yields nonzero.
 */
void swap_free(unsigned int swap_nr)
{
	if (!swap_nr)
		return;
	if (swap_bitmap && swap_nr < SWAP_BITS) {
		if (swap_nr < lowest_bit)
			lowest_bit = swap_nr;
		if (swap_nr > highest_bit)
			highest_bit = swap_nr;
		/* presumably setbit() reports the previous bit value, so nonzero means the slot was already free - confirm */
		if (!setbit(swap_bitmap,swap_nr))
			return;
	}
	printk("swap_free: swap-space bitmap bad (bit %d)\n",swap_nr);
	return;
}
```

* `lowest_bit` is the lowest bit that is 1 (free) and `highest_bit` is the highest bit that is 1 (free). If all bits are 0 (occupied), `lowest_bit > highest_bit`.

* For a workload that swaps in and out frequently, this is much faster than scanning every bit each time, which is what the 0.12 kernel does.

* NICE IMPROVEMENT !

------------



```c
/*
 * Bring a swapped-out page back into memory.
 *
 * table_ptr points at the page-table entry. For a swapped-out page the
 * entry has bit 0 clear and holds the swap slot number shifted left by
 * one. On success the entry is rewritten to point at the freshly read
 * page and the swap slot is released.
 */
void swap_in(unsigned long *table_ptr)
{
	unsigned long swap_nr;
	unsigned long page;

	swap_nr = *table_ptr;
	/* bit 0 set: the entry already maps a page */
	if (1 & swap_nr) {
		printk("trying to swap in present page\n\r");
		return;
	}
	/* an all-zero entry carries no swap slot */
	if (!swap_nr) {
		printk("No swap page in swap_in\n\r");
		return;
	}
	if (!swap_bitmap) {
		printk("Trying to swap in without swap bit-map");
		*table_ptr = BAD_PAGE;
		return;
	}
	page = get_free_page(GFP_KERNEL);
	if (!page) {
		oom(current);
		page = BAD_PAGE;
	} else	
		read_swap_page(swap_nr>>1, (char *) page);
	/*
	 * The entry is re-read here; if it no longer holds the value
	 * sampled above, the new page is dropped and the entry is left
	 * alone. NOTE(review): presumably the calls above can sleep and
	 * let another context change the entry - confirm.
	 */
	if (*table_ptr != swap_nr) {
		free_page(page);
		return;
	}
	swap_free(swap_nr>>1);
	/* install the page with PAGE_DIRTY and the low three bits set */
	*table_ptr = page | (PAGE_DIRTY | 7);
}
```

* Reads a page from SWAP_DEV into a memory page. `table_ptr` points to the page-table entry.

* For a page that has been swapped out to SWAP_DEV, the page-table entry stores `swap_nr*2 = (swap_nr<<1)`.

* For page table entry, the first bit is `Present` flag. If `Present=0`, the other 31 bits are free to used.

![swap01](imgs/swap01.png)

----------

```c
/*
 * Try to evict the page mapped by the page-table entry *table_ptr.
 *
 * Returns 1 when the page was freed, 0 when it was left alone. A page is
 * left alone when it is not present, lies outside [low_memory,
 * high_memory), or is listed in last_free_pages[]. A dirty page is also
 * left alone when its mem_map[] entry is not 1 or no swap slot could be
 * allocated.
 */
int try_to_swap_out(unsigned long * table_ptr)
{
	int i;
	unsigned long page;
	unsigned long swap_nr;

	page = *table_ptr;
	if (!(PAGE_PRESENT & page))
		return 0;
	if (page < low_memory || page >= high_memory)
		return 0;
	/* never evict a page that is on the last_free_pages[] list */
	for (i = 0; i < NR_LAST_FREE_PAGES; i++)
		if (last_free_pages[i] == (page & 0xfffff000))
			return 0;
	if (PAGE_DIRTY & page) {
		/* strip the flag bits, keeping the page frame address */
		page &= 0xfffff000;
		/* only pages whose mem_map[] count is exactly 1 are written out */
		if (mem_map[MAP_NR(page)] != 1)
			return 0;
		if (!(swap_nr = get_swap_page()))
			return 0;
		/* slot number goes in bits 1..31, bit 0 (present) stays clear */
		*table_ptr = swap_nr<<1;
		invalidate();
		write_swap_page(swap_nr, (char *) page);
		free_page(page);
		return 1;
	}
	/* page is not dirty: clear the entry and free the page without writing it out */
	page &= 0xfffff000;
	*table_ptr = 0;
	invalidate();
	free_page(page);
	return 1;
}
```

----------

```c
/*
 * Go through the page tables, searching for a user page that
 * we can swap out.
 *
 * We now check that the process is swappable (normally only 'init'
 * is un-swappable), allowing high-priority processes which cannot be
 * swapped out (things like user-level device drivers (Not implemented)).
 *
 * Returns 1 as soon as one page has been evicted, 0 (after printing a
 * message) once the scan budget of VM_PAGES entries is used up. The scan
 * position is kept in static variables, so each call continues from where
 * the previous one stopped.
 */
int swap_out(void)
{
	static int dir_entry = 1024;	/* current page-directory index; 1024 forces a wrap on first use */
	static int page_entry = -1;	/* current index inside the page table; pre-incremented below */
	int counter = VM_PAGES;		/* scan budget for this call */
	int pg_table;
	struct task_struct * p;

check_dir:
	if (counter < 0)
		goto no_swap;
	/* wrap around to the first directory entry above task[0] */
	if (dir_entry >= 1024)
		dir_entry = FIRST_VM_PAGE>>10;
	/* 16 directory entries per task slot, so dir_entry >> 4 is the task index */
	if (!(p = task[dir_entry >> 4])) {
		/* empty task slot: skip this directory entry (1024 pages) */
		counter -= 1024;
		dir_entry++;
		goto check_dir;
	}
	/* bit 0 of a directory entry is the present flag */
	if (!(1 & (pg_table = pg_dir[dir_entry]))) {
		/* non-zero but not present: report it and clear the entry */
		if (pg_table) {
			printk("bad page-table at pg_dir[%d]: %08x\n\r",
				dir_entry,pg_table);
			pg_dir[dir_entry] = 0;
		}
		counter -= 1024;
		dir_entry++;
		goto check_dir;
	}
	/* drop the flag bits to get the page table's address */
	pg_table &= 0xfffff000;
check_table:
	if (counter < 0)
		goto no_swap;
	counter--;
	page_entry++;
	/* ran off the end of this page table: move to the next directory entry */
	if (page_entry >= 1024) {
		page_entry = -1;
		dir_entry++;
		goto check_dir;
	}
	if (p->swappable && try_to_swap_out(page_entry + (unsigned long *) pg_table)) {
		p->rss--;
		/* the next call starts in the following directory entry */
		dir_entry++;
		return 1;
	}
	goto check_table;
no_swap:
	printk("Out of swap-memory\n\r");
	return 0;
}
```

* This style (`goto` used for loops) is meant to produce good, fast machine code.

* This function searches the (4GB - 64MB) linear address range for a page that can be swapped out.

* 
```c
	static int dir_entry = 1024;
	static int page_entry = -1;
```

static variables are used for next search position.

*
```c
	if (counter < 0)
		goto no_swap;
	if (dir_entry >= 1024)
		dir_entry = FIRST_VM_PAGE>>10;
	if (!(p = task[dir_entry >> 4])) {
		counter -= 1024;
		dir_entry++;
		goto check_dir;
	}
	if (!(1 & (pg_table = pg_dir[dir_entry]))) {
		if (pg_table) {
			printk("bad page-table at pg_dir[%d]: %08x\n\r",
				dir_entry,pg_table);
			pg_dir[dir_entry] = 0;
		}
		counter -= 1024;
		dir_entry++;
		goto check_dir;
	}
	pg_table &= 0xfffff000;
```

1. Every dir_entry has 1024 * 4KB = 4MB address and every task has 64MB address. So every task needs 16 page directory entries. So  `task id = dir_entry / 16 = dir_entry >> 4`


2. `(1 & (pg_table = pg_dir[dir_entry]))`: the lowest bit is the present flag.


* 
```c
check_table:
	if (counter < 0)
		goto no_swap;
	counter--;
	page_entry++;
	if (page_entry >= 1024) {
		page_entry = -1;
		dir_entry++;
		goto check_dir;
	}
	if (p->swappable && try_to_swap_out(page_entry + (unsigned long *) pg_table)) {
		p->rss--;
		dir_entry++;
		return 1;
	}
	goto check_table;
```

1. check every page in page table.

2. `rss` is the number of resident pages.

----------


```c
__asm__("std ; repne ; scasb\n\t"
    "jne 1f\n\t"
    "movb $1,1(%%edi)\n\t" // 1 -> [1 + edi], set the count number of this page to 1
    "sall $12,%%ecx\n\t" // page_count * 4K = page_address
    "addl %2,%%ecx\n\t" // low_memory + page_address = real address. Because the memory below low_memory is used for kernel permanently.
    "movl %%ecx,%%edx\n\t" // move the real address from ecx to edx
    "movl $1024,%%ecx\n\t" // 1024 -> ecx
    "leal 4092(%%edx),%%edi\n\t" // edx is the start address, and the end address = edx + 4096 - 4 = 4092 + edx. In the next instruction, we will set zero by stosl, which operates 4bytes one time. So 4096 - 4  + edx is the start address(the last long variable)
    "rep ; stosl\n\t" // rep 1024 times to set 0 to this page
    "movl %%edx,%%eax\n" // real address to eax(return value)
    "1:\tcld"
    :"=a" (result)
    :"0" (0),"b" (low_memory),"c" (paging_pages),
    "D" (mem_map+paging_pages-1)
    :"di","cx","dx");
```

* std: Sets the DF flag in the EFLAGS register. When the DF flag is set to 1, string operations decrement the index registers (ESI and/or EDI). Operation is the same in all modes.

* repne: Repeats a string instruction the number of times specified in the count register or until the indicated condition of the ZF flag is no longer met. The REP (repeat), REPE (repeat while equal), REPNE (repeat while not equal), REPZ (repeat while zero), and REPNZ (repeat while not zero) mnemonics are prefixes that can be added to one of the string instructions.

* scasb: The no-operands form of the instruction uses a short form of SCAS. Again, ES:(E)DI is assumed to be the memory operand and AL, AX, or EAX is assumed to be the register operand. The size of operands is selected by the mnemonic: SCASB (byte comparison), SCASW (word comparison), or SCASD (doubleword comparison).

* `mem_map` is the use-count array for physical pages: each element is the use count of one page and is a single byte (hence `scasb` / `movb`).

--------

## mm/mmap.c

```c
caddr_t
sys_mmap(unsigned long *buffer)
{
	unsigned long base, addr;
	unsigned long len, limit, off;
	int prot, flags, fd;
	struct file *file;
	struct inode *inode;

	addr = (unsigned long)	get_fs_long(buffer);	/* user address space*/
	len = (size_t)		get_fs_long(buffer+1);	/* nbytes of mapping */
	prot = (int)		get_fs_long(buffer+2);	/* protection */
	flags = (int)		get_fs_long(buffer+3);	/* mapping type */
	fd = (int) 		get_fs_long(buffer+4);	/* object to map */
	off = (unsigned long)	get_fs_long(buffer+5);	/* offset in object */

	if (fd >= NR_OPEN || fd < 0 || !(file = current->filp[fd]))
		return (caddr_t) -EBADF;
	if (addr > TASK_SIZE || (addr+(unsigned long) len) > TASK_SIZE)
		return (caddr_t) -EINVAL;
	inode = file->f_inode;
```

* `buffer` points to six consecutive longs in user space that describe the mapping: addr, len, prot, flags, fd, offset.

----------------

## kernel/blk_drv/blk.h

```c
/*
 * This is used in the elevator algorithm: Note that
 * reads always go before writes. This is natural: reads
 * are much more time-critical than writes.
 *
 * IN_ORDER(s1,s2) is true when request s1 sorts strictly before request
 * s2, comparing cmd first, then dev, then sector. Both arguments are
 * evaluated more than once, so they must be free of side effects.
 */
#define IN_ORDER(s1,s2) \
((s1)->cmd < (s2)->cmd || ((s1)->cmd == (s2)->cmd && \
((s1)->dev < (s2)->dev || (((s1)->dev == (s2)->dev && \
(s1)->sector < (s2)->sector)))))

```

* Sorts the requests by (cmd, dev, sector) in increasing order.

* elevator algorithm: When a new request arrives while the drive is idle, the initial arm/head movement will be in the direction of the cylinder where the data is stored, either in or out. As additional requests arrive, requests are serviced only in the current direction of arm movement until the arm reaches the edge of the disk. When this happens, the direction of the arm reverses, and the requests that were remaining in the opposite direction are serviced, and so on

------------


## include/linux/a.out.h

```c
struct exec
{
  unsigned long a_info;		/* Use macros N_MAGIC, etc for access */
  unsigned a_text;		/* length of text, in bytes */
  unsigned a_data;		/* length of data, in bytes */
  unsigned a_bss;		/* length of uninitialized data area for file, in bytes */
  unsigned a_syms;		/* length of symbol table data in file, in bytes */
  unsigned a_entry;		/* start address */
  unsigned a_trsize;		/* length of relocation info for text, in bytes */
  unsigned a_drsize;		/* length of relocation info for data, in bytes */
};
```

![a.out01.png](imgs/a.out01.png)

* exec header structure

```c
#define _N_HDROFF(x) (1024 - sizeof (struct exec))

#if !defined (N_TXTOFF)
#define N_TXTOFF(x) \
 (N_MAGIC(x) == ZMAGIC ? _N_HDROFF((x)) + sizeof (struct exec) : sizeof (struct exec))
#endif
```

* For ZMAGIC, the header occupies the first 1K of the file, so the text starts at offset 1024.


```c
#if !defined (N_DATOFF)
#define N_DATOFF(x) (N_TXTOFF(x) + (x).a_text)
#endif

#if !defined (N_TRELOFF)
#define N_TRELOFF(x) (N_DATOFF(x) + (x).a_data)
#endif

#if !defined (N_DRELOFF)
#define N_DRELOFF(x) (N_TRELOFF(x) + (x).a_trsize)
#endif

#if !defined (N_SYMOFF)
#define N_SYMOFF(x) (N_DRELOFF(x) + (x).a_drsize)
#endif

#if !defined (N_STROFF)
#define N_STROFF(x) (N_SYMOFF(x) + (x).a_syms)
#endif

/* Address of text segment in memory after it is loaded.  */
#if !defined (N_TXTADDR)
#define N_TXTADDR(x) 0
#endif
```

* offsets macros

* The text segment is loaded at address 0 in memory.

```c
#ifdef linux
#define PAGE_SIZE	4096
#define SEGMENT_SIZE	1024
#endif

#define _N_SEGMENT_ROUND(x) (((x) + SEGMENT_SIZE - 1) & ~(SEGMENT_SIZE - 1))
```

* Nice way to round `x` up to a multiple of the segment size, i.e. `((x + SEGMENT_SIZE - 1) / SEGMENT_SIZE) * SEGMENT_SIZE`. `SEGMENT_SIZE` must be a power of 2.


```c
/* Address of bss segment in memory after it is loaded.  */
#if !defined (N_BSSADDR)
#define N_BSSADDR(x) (N_DATADDR(x) + (x).a_data)
#endif
```

* BSS segment is after the data segment

-----------------------

## include/linux/ctype.h

```c
#define _U	0x01	/* upper */
#define _L	0x02	/* lower */
#define _D	0x04	/* digit */
#define _C	0x08	/* cntrl */
#define _P	0x10	/* punct */
#define _S	0x20	/* white space (space/lf/tab) */
#define _X	0x40	/* hex digit */
#define _SP	0x80	/* hard space (0x20) */

extern unsigned char _ctype[];
extern char _ctmp;

/*
 * Character-class tests: each one looks up the flag byte for c and masks
 * it with the class bits, so the result is zero / nonzero rather than
 * 0 / 1. The table is indexed at (_ctype+1)[c], i.e. _ctype[c+1], so
 * c == -1 lands on _ctype[0] (presumably so that EOF is a valid
 * argument - confirm against the table definition).
 */
#define isalnum(c) ((_ctype+1)[c]&(_U|_L|_D))
#define isalpha(c) ((_ctype+1)[c]&(_U|_L))
#define iscntrl(c) ((_ctype+1)[c]&(_C))
#define isdigit(c) ((_ctype+1)[c]&(_D))
#define isgraph(c) ((_ctype+1)[c]&(_P|_U|_L|_D))
#define islower(c) ((_ctype+1)[c]&(_L))
#define isprint(c) ((_ctype+1)[c]&(_P|_U|_L|_D|_SP))
#define ispunct(c) ((_ctype+1)[c]&(_P))
#define isspace(c) ((_ctype+1)[c]&(_S))
#define isupper(c) ((_ctype+1)[c]&(_U))
#define isxdigit(c) ((_ctype+1)[c]&(_D|_X))

/* Pure range / mask operations on the value; no table lookup. */
#define isascii(c) (((unsigned) c)<=0x7f)
#define toascii(c) (((unsigned) c)&0x7f)

/*
 * Case conversion. The argument is first stored in the global _ctmp so
 * that it is evaluated only once even though the value is used twice.
 * Because _ctmp is a single shared variable, these macros cannot be
 * nested and are not reentrant.
 */
#define tolower(c) (_ctmp=c,isupper(_ctmp)?_ctmp-('A'-'a'):_ctmp)
#define toupper(c) (_ctmp=c,islower(_ctmp)?_ctmp-('a'-'A'):_ctmp)
```

* char type macros. Nice method.

------------

## include/linux/stddef.h

```c
#ifndef _SIZE_T
#define _SIZE_T
typedef unsigned int size_t;
#endif

#undef NULL
#define NULL ((void *)0)

#undef offsetof
#define offsetof(TYPE, MEMBER) ((size_t) &((TYPE *)0)->MEMBER)
```

* NICE METHOD !!! `(TYPE *)0` convert 0 to the `TYPE*` which the struct base address is 0, then get the `MEMBER` address which is the offset !

---------

## include/linux/string.h
## lib/string.c

`string.h` defines several `extern inline` functions

```c
extern inline char * strcpy(char * dest,const char *src)
```


`string.c` define extern and inline to empty and then include the `string.h`. 

```c
#include <linux/types.h>

#define extern
#define inline
#define __LIBRARY__
#include <linux/string.h>

```

NICE TRICK !!!!


![string01.png](imgs/string01.png)

-------------------------


```c
/*
 * Copy n bytes from src to dest, lowest address first, and return dest.
 * The copy always runs upward (cld), so there is no handling for
 * overlapping regions here; memmove() covers that case.
 */
extern inline void * memcpy(void * dest,const void * src, size_t n)
{
/* ecx = byte count, esi = source, edi = destination for "rep movsb" */
__asm__("cld\n\t"
	"rep\n\t"
	"movsb"
	::"c" (n),"S" (src),"D" (dest)
	:"cx","si","di");
return dest;
}

/*
 * Copy n bytes from src to dest, allowing the two regions to overlap, and
 * return dest. The copy direction is chosen so that source bytes are read
 * before they can be overwritten.
 */
extern inline void * memmove(void * dest,const void * src, size_t n)
{
if (dest<src)
/* destination below source: copy upward, starting at the first byte */
__asm__("cld\n\t"
	"rep\n\t"
	"movsb"
	::"c" (n),"S" (src),"D" (dest)
	:"cx","si","di");
else
/*
 * destination at or above source: copy downward, starting at the last
 * byte (src+n-1 / dest+n-1); the trailing cld clears the direction flag
 * again
 */
__asm__("std\n\t"
	"rep\n\t"
	"movsb\n\t"
	"cld"
	::"c" (n),"S" (src+n-1),"D" (dest+n-1)
	:"cx","si","di");
return dest;
}
```

* `memcpy` just copy from src to dst

* `memmove` handles overlapping src and dst. If dst < src it copies forward from the first byte; otherwise it copies backward from the last byte. NICE.

----------

## include/linux/unistd.h

```c
#define _syscall0(type,name) 
#define _syscall1(type,name,atype,a)
#define _syscall2(type,name,atype,a,btype,b)
#define _syscall3(type,name,atype,a,btype,b,ctype,c)
#define _syscall4(type,name,atype,a,btype,b,ctype,c,dtype,d)
#define _syscall5(type,name,atype,a,btype,b,ctype,c,dtype,d,etype,e)
```

* Defines six macros, `_syscall0` to `_syscall5`, one for each argument count from 0 to 5.

```c
/*
 * Generates the user-side stub `type name(a, b, c, d, e)` for a system
 * call with five arguments.
 *
 * The syscall number __NR_<name> goes in through eax (constraint "0",
 * tied to output operand 0) and the result comes back in eax as __res.
 * The five arguments are bound to ebx, ecx, edx, esi and edi ("b", "c",
 * "d", "S", "D"). Operand %2 is the first argument, which the "b"
 * constraint already places in ebx, before "int $0x80" is executed.
 *
 * A non-negative result is returned cast to `type`; otherwise errno is
 * set to the negated result and the stub returns -1.
 */
#define _syscall5(type,name,atype,a,btype,b,ctype,c,dtype,d,etype,e) \
type name (atype a,btype b,ctype c,dtype d,etype e) \
{ \
long __res; \
__asm__ volatile ("movl %2,%%ebx\n\t" \
	"int $0x80" \
	: "=a" (__res) \
	: "0" (__NR_##name),"b" ((long)(a)),"c" ((long)(b)), \
	  "d" ((long)(c)),"S" ((long)(d)),"D" ((long)(e))); \
if (__res>=0) \
	return (type) __res; \
errno=-__res; \
return -1; \
}
```

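* The five arguments are passed in ebx, ecx, edx, esi and edi (constraints `b`, `c`, `d`, `S`, `D`); eax (constraint `a`) carries the syscall number in and the return value out.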

--------

## include/asm/io.h

```c
extern void inline outb(char value, unsigned short port)
{
__asm__ __volatile__ ("outb %0,%1"
		::"a" ((char) value),"d" ((unsigned short) port));
}

extern unsigned char inline inb(unsigned short port)
{
	unsigned char _v;
__asm__ __volatile__ ("inb %1,%0"
		:"=a" (_v):"d" ((unsigned short) port));
	return _v;
}

```

* output/input a char to a port


```c
/*
 * Thanks to James van Artsdalen for a better timing-fix than
 * the two short jumps: using outb's to a nonexistent port seems
 * to guarantee better timings even on fast machines.
 *
 *		Linus
 */

extern void inline outb_p(char value, unsigned short port)
{
__asm__ __volatile__ ("outb %0,%1\n\t"
#ifdef REALLY_SLOW_IO
		  "outb %0,$0x80\n\t"
		  "outb %0,$0x80\n\t"
		  "outb %0,$0x80\n\t"
#endif
		  "outb %0,$0x80"
		::"a" ((char) value),"d" ((unsigned short) port));
}
```

* Extra `outb`s to a nonexistent port (0x80) are used as a delay.

-----------------

## linux/asm/segment.h

![segment01.png](imgs/segment01.png)

```c
extern inline unsigned char get_fs_byte(const char * addr)
{
	unsigned register char _v;

	__asm__ ("movb %%fs:%1,%0":"=q" (_v):"m" (*addr));
	return _v;
}
```

* return char at `fs:[addr]` 

---------------

```c
extern inline unsigned long get_fs() 
{
	unsigned short _v;
	__asm__("mov %%fs,%0":"=r" (_v):);
	return _v;
}

extern inline unsigned long get_ds() 
{
	unsigned short _v;
	__asm__("mov %%ds,%0":"=r" (_v):);
	return _v;
}

extern inline void set_fs(unsigned long val)
{
	__asm__ __volatile__("mov %0,%%fs"::"r" ((unsigned short) val));
}
```

------------

## include/linux/resources.h

```c
struct	rusage {
	struct timeval ru_utime;	/* user time used */
	struct timeval ru_stime;	/* system time used */
	long	ru_maxrss;		/* maximum resident set size */
	long	ru_ixrss;		/* integral shared memory size */
	long	ru_idrss;		/* integral unshared data size */
	long	ru_isrss;		/* integral unshared stack size */
	long	ru_minflt;		/* page reclaims */
	long	ru_majflt;		/* page faults */
	long	ru_nswap;		/* swaps */
	long	ru_inblock;		/* block input operations */
	long	ru_oublock;		/* block output operations */
	long	ru_msgsnd;		/* messages sent */
	long	ru_msgrcv;		/* messages received */
	long	ru_nsignals;		/* signals received */
	long	ru_nvcsw;		/* voluntary context switches */
	long	ru_nivcsw;		/* involuntary " */
};
```

```c
/*
 * Resource limits
 */
#define RLIMIT_CPU	0		/* CPU time in ms */
#define RLIMIT_FSIZE	1		/* Maximum filesize */
#define RLIMIT_DATA	2		/* max data size */
#define RLIMIT_STACK	3		/* max stack size */
#define RLIMIT_CORE	4		/* max core file size */
#define RLIMIT_RSS	5		/* max resident set size */

#ifdef notdef
#define RLIMIT_MEMLOCK	6		/* max locked-in-memory address space*/
#define RLIMIT_NPROC	7		/* max number of processes */
#define RLIMIT_OFILE	8		/* max number of open files */
#endif

#define RLIM_NLIMITS	6

#define RLIM_INFINITY	0x7fffffff

struct rlimit {
	int	rlim_cur;
	int	rlim_max;
};

```

* Several resource limits. The task structure holds an array of `struct rlimit`, indexed by the `RLIMIT_*` constants (`RLIM_NLIMITS` entries).

![resources01.png](imgs/resource01.png)

--------------

## include/linux/time.h

```c
struct timeval {
	long	tv_sec;		/* seconds */
	long	tv_usec;	/* microseconds */
};

struct timezone {
	int	tz_minuteswest;	/* minutes west of Greenwich */
	int	tz_dsttime;	/* type of dst correction */
};

/*
 * Names of the interval timers, and structure
 * defining a timer setting.
 */
#define	ITIMER_REAL	0
#define	ITIMER_VIRTUAL	1
#define	ITIMER_PROF	2

struct	itimerval {
	struct	timeval it_interval;	/* timer interval */
	struct	timeval it_value;	/* current value */
};
```
* several time structures

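* `itimerval` describes an interval timer: `it_value` is the time left until the next expiry and `it_interval` is the period it is reloaded with.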

```c
#define FD_SETSIZE		(8*sizeof(fd_set))
#define FD_SET(fd,fdsetp)	(*(fdsetp) |= (1 << (fd)))
#define FD_CLR(fd,fdsetp)	(*(fdsetp) &= ~(1 << (fd)))
#define FD_ISSET(fd,fdsetp)	((*(fdsetp) >> fd) & 1)
#define FD_ZERO(fdsetp)		(*(fdsetp) = 0)
```

* several fd operation macros

-----------

## include/linux/times.h

```c
struct tms {
	time_t tms_utime;  // used cpu time
	time_t tms_stime;  // used kernel time
	time_t tms_cutime; // killed child processes used cpu time
	time_t tms_cstime; // killed child processes used kernel time
};
```

-------------

## kernel

![interupt02.png](imgs/interupt02.png)
![interupt01.png](imgs/interupt01.png)

* EIP instruction pointer register

* ESP stack pointer (top of stack) register

* SS stack segment register

* CS code segment register

* DS data segment register

![interupt03.png](imgs/interupt03.png)

---------

## kernel/mktime.c

```c
#define MINUTE 60
#define HOUR (60*MINUTE)
#define DAY (24*HOUR)
#define YEAR (365*DAY)

/* Number of seconds from the start of the year to the start of each month */

/* interestingly, we assume leap-years */
/* (February is counted as 29 days; kernel_mktime() corrects non-leap years) */
static int month[12] = {
	0,
	DAY*(31),
	DAY*(31+29),
	DAY*(31+29+31),
	DAY*(31+29+31+30),
	DAY*(31+29+31+30+31),
	DAY*(31+29+31+30+31+30),
	DAY*(31+29+31+30+31+30+31),
	DAY*(31+29+31+30+31+30+31+31),
	DAY*(31+29+31+30+31+30+31+31+30),
	DAY*(31+29+31+30+31+30+31+31+30+31),
	DAY*(31+29+31+30+31+30+31+31+30+31+30)
};

/*
 * Convert a broken-down time into a count of seconds.
 *
 * The year is taken relative to 70 (presumably tm_year counts from 1900
 * as in the standard struct tm, making the epoch 1970 - confirm). Only
 * the "every fourth year" leap rule is applied; the century exceptions
 * are not handled. No range checking is done on any field; tm_mon is
 * used directly as an index into month[].
 */
long kernel_mktime(struct tm * tm)
{
	long res;
	int year;

	year = tm->tm_year - 70;
/* magic offsets (y+1) needed to get leapyears right.*/
	/* whole years elapsed, plus one extra day per leap year before this one */
	res = YEAR*year + DAY*((year+1)/4);
	res += month[tm->tm_mon];
/* and (y+2) here. If it wasn't a leap-year, we have to adjust */
	/* month[] counts Feb 29; take that day back after February of a non-leap year */
	if (tm->tm_mon>1 && ((year+2)%4))
		res -= DAY;
	res += DAY*(tm->tm_mday-1);
	res += HOUR*tm->tm_hour;
	res += MINUTE*tm->tm_min;
	res += tm->tm_sec;
	return res;
}
```

* leap year: ((year % 4 == 0) && (year % 100 > 0)) || (year % 400 == 0)

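* 1972 is the first leap year after 1970. With `year` counted from 1970, `(year+1)/4` is the number of leap years that have fully elapsed before the given year, and the given year is itself a leap year when `(year+2) % 4 == 0`.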

* The code ignores the 100 and 400 rules. That is harmless for 2000, which is a leap year anyway because it is divisible by 400; the first year this method gets wrong is 2100.

---------

## kernel/sched.c

![sched01.png](imgs/sched01.png)
![sched02.png](imgs/sched02.png)
![sched03.png](imgs/sched03.png)

----------------

```c
/*
 * Print a debugging summary for task slot nr / task structure p.
 *
 * The number is printed negated when p is the current task. The output
 * lists the pid, state, parent pid, child pid (-1 if none), an estimate
 * of unused kernel stack, one saved word labelled PC, and the sibling
 * pids when there are any.
 */
static void show_task(int nr,struct task_struct * p)
{
	/* j = bytes of the 4096-byte area that follow the task structure */
	int i,j = 4096-sizeof(struct task_struct);

	printk("%d: pid=%d, state=%d, father=%d, child=%d, ",(p == current)?-nr:nr,p->pid,
		p->state, p->p_pptr->pid, p->p_cptr ? p->p_cptr->pid : -1);
	/* count the leading zero bytes right after the task structure */
	i=0;
	while (i<j && !((char *)(p+1))[i])
		i++;
	printk("%d/%d chars free in kstack\n\r",i,j);
	/* long number 1019 from p, i.e. byte offset 4076; presumably the saved EIP near the stack top - confirm */
	printk("   PC=%08X.", *(1019 + (unsigned long *) p));
	if (p->p_ysptr || p->p_osptr) 
		printk("   Younger sib=%d, older sib=%d\n\r", 
			p->p_ysptr ? p->p_ysptr->pid : -1,
			p->p_osptr ? p->p_osptr->pid : -1);
	else
		printk("\n\r");
}
```

![sched04.png](imgs/sched04.png)
![sched05.png](imgs/sched05.png)
![sched06.png](imgs/sched06.png)
![sched07.png](imgs/sched07.png)
![sched08.png](imgs/sched08.png)

* For process, the task struct and process's kernel stack in the same page.

```
|task struct|         kernel stack           |
|<- low address      4KB      high address ->|
```

So `4096 - sizeof(struct task_struct)` is the maximum kernel stack size. The kernel stack grows from the high address toward the low address, so `while(i<j && !((char*)(p+1))[i])` counts the zero bytes directly after the task struct, which is the stack space that has never been used.

-------------

```c
union task_union {
	struct task_struct task;
	char stack[PAGE_SIZE];
};
```

* task_struct and kernel stack are in the same page with a page size = 4KB. 

--------

```c
unsigned long volatile jiffies=0;
unsigned long startup_time=0;
int jiffies_offset = 0;		/* # clock ticks to add to get "true
				   time".  Should always be less than
				   1 second's worth.  For time fanatics
				   who like to syncronize their machines
				   to WWV :-) */

```

![sched09.png](imgs/sched09.png)

---------

```c
struct task_struct *current = &(init_task.task);
struct task_struct *last_task_used_math = NULL;

struct task_struct * task[NR_TASKS] = {&(init_task.task), };

long user_stack [ PAGE_SIZE>>2 ] ;

struct {
	long * a;
	short b;
	} stack_start = { & user_stack [PAGE_SIZE>>2] , 0x10 };
```

![sched10.png](imgs/sched10.png)

-----------

```c
static struct timer_list {
	long jiffies;
	void (*fn)();
	struct timer_list * next;
} timer_list[TIME_REQUESTS] = { { 0, NULL, NULL }, };

static struct timer_list * next_timer = NULL;

/*
 * Register fn to be called after `jiffies` ticks.
 *
 * A NULL fn is ignored. With jiffies <= 0 the function is called at once,
 * between cli() and sti(). Otherwise a free slot of timer_list[] (one
 * whose fn is NULL) is claimed, with a panic if there is none, and the
 * entry is linked into the list headed by next_timer.
 *
 * Note that the parameter `jiffies` hides any global of the same name
 * inside this function.
 */
void add_timer(long jiffies, void (*fn)(void))
{
	struct timer_list * p;

	if (!fn)
		return;
	cli();
	if (jiffies <= 0)
		(fn)();
	else {
		/* look for an unused slot: fn == NULL marks a free entry */
		for (p = timer_list ; p < timer_list + TIME_REQUESTS ; p++)
			if (!p->fn)
				break;
		if (p >= timer_list + TIME_REQUESTS)
			panic("No more time requests free");
		/* insert the new entry at the head of the list */
		p->fn = fn;
		p->jiffies = jiffies;
		p->next = next_timer;
		next_timer = p;
		/*
		 * Move the new request down the list while the following
		 * entry has a smaller count. At each step the follower's
		 * count is subtracted from the new one, then the two nodes
		 * exchange their fn and jiffies values (the nodes stay
		 * linked as they are). The new request therefore ends up
		 * holding a count relative to the entries before it.
		 */
		while (p->next && p->next->jiffies < p->jiffies) {
			p->jiffies -= p->next->jiffies;
			fn = p->fn;
			p->fn = p->next->fn;
			p->next->fn = fn;
			jiffies = p->jiffies;
			p->jiffies = p->next->jiffies;
			p->next->jiffies = jiffies;
			p = p->next;
		}
	}
	sti();
}
```

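* Timer list insertion and sorting. The new timer is put at the head and then moved down by swapping the contents of neighbouring nodes, with its count made relative to the entries in front of it. Naive code.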

--------------